@akukral/site-comparator
Version:
A sophisticated website comparison tool with intelligent content analysis and offset-aware difference detection
1,268 lines (1,134 loc) • 80.3 kB
JavaScript
#!/usr/bin/env node
const fs = require('fs').promises;
const path = require('path');
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const crypto = require('crypto');
const { URL } = require('url');
const readline = require('readline');
const readlineSync = require('readline-sync');
/**
* Site Comparator - A tool for comparing websites and detecting differences in content, structure, and functionality.
*
* This class provides comprehensive website comparison capabilities including:
* - Automated page discovery and crawling
* - Content normalization and comparison
* - Authentication handling for protected sites
* - Detailed difference analysis and reporting
* - HTML and JSON report generation
*
* @class Comparator
* @example
* const comparator = new Comparator({ maxPages: 10, delay: 2000 });
* await comparator.compare('https://staging.example.com', 'https://example.com');
*/
class Comparator {
/**
* Creates a new Comparator instance with configurable options.
*
* @param {Object} options - Configuration options for the comparator
* @param {number} [options.maxPages=20] - Maximum number of pages to crawl per site
* @param {number} [options.maxDiscovery=500] - Maximum number of unique links to discover
* @param {number} [options.delay=1000] - Delay between requests in milliseconds
* @param {number} [options.timeout=30000] - Page load timeout in milliseconds
* @param {string[]} [options.ignoreElements=['script', 'noscript', 'style']] - HTML elements to ignore during comparison
* @param {string[]} [options.ignoreAttributes=['data-csrf', 'csrf-token', '_token', 'nonce']] - HTML attributes to ignore during comparison
* @param {string[]} [options.ignoreClasses=['timestamp', 'csrf', 'nonce', 'random']] - CSS classes to ignore during comparison
* @param {string} [options.userAgent='Comparator Bot 1.3.0'] - User agent string for requests
* @param {string} [options.outputDir='./comparator-results'] - Directory to save comparison results
*/
constructor(options = {}) {
this.options = {
maxPages: 20,
maxDiscovery: 500,
delay: 1000,
timeout: 30000,
ignoreElements: ['script', 'noscript', 'style'],
ignoreAttributes: ['data-csrf', 'csrf-token', '_token', 'nonce'],
ignoreClasses: ['timestamp', 'csrf', 'nonce', 'random'],
userAgent: 'Comparator Bot 1.3.0',
outputDir: './comparator-results',
...options
};
this.visited = new Set();
this.results = {
compared: 0,
differences: [],
errors: [],
summary: {}
};
}
/**
* Helper method for waiting - compatible with all Puppeteer versions
*
* This method provides a consistent way to wait for a specified time period
* across different versions of Puppeteer, handling the API changes between
* older versions (waitForTimeout) and newer versions (setTimeout).
*
* @async
* @param {Page} page - Puppeteer page instance
* @param {number} milliseconds - Time to wait in milliseconds
* @returns {Promise<void>} Promise that resolves after the specified delay
* @example
* await comparator.waitFor(page, 2000); // Wait 2 seconds
*/
async waitFor(page, milliseconds) {
if (typeof page.waitForTimeout === 'function') {
// Older Puppeteer versions
return await page.waitForTimeout(milliseconds);
} else {
// Newer Puppeteer versions - use setTimeout
return new Promise(resolve => setTimeout(resolve, milliseconds));
}
}
/**
* Initializes the comparator by creating output directory and launching browser.
*
* This method sets up the environment for website comparison including:
* - Creating the output directory if it doesn't exist
* - Launching a Puppeteer browser instance with optimized settings
* - Configuring browser security and compatibility options
*
* @async
* @throws {Error} If browser launch fails or directory creation fails
* @example
* await comparator.init();
*/
async init() {
// Create output directory
await fs.mkdir(this.options.outputDir, { recursive: true });
// Launch browser with better compatibility options
this.browser = await puppeteer.launch({
headless: 'new',
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-web-security',
'--allow-running-insecure-content',
'--ignore-certificate-errors',
'--ignore-ssl-errors'
]
});
}
/**
* Cleans up resources by closing the browser instance.
*
* This method should be called when the comparator is no longer needed
* to free up system resources and close browser processes.
*
* @async
* @example
* await comparator.cleanup();
*/
async cleanup() {
if (this.browser) {
await this.browser.close();
}
}
/**
* Retrieves authentication credentials for a domain.
*
* This method attempts to get credentials from multiple sources in order:
* 1. Command line options passed to the compare method
* 2. Environment variables (COMPARATOR_USER_<DOMAIN>, COMPARATOR_PASS_<DOMAIN>)
* 3. Global environment variables (COMPARATOR_USERNAME, COMPARATOR_PASSWORD)
* 4. Interactive prompts for username and password
*
* @async
* @param {string} domain - The domain requiring authentication
* @param {Object} options - Authentication options
* @param {string} [options.username] - Username for authentication
* @param {string} [options.password] - Password for authentication
* @returns {Promise<Object|null>} Authentication object with username and password, or null if no credentials
* @example
* const auth = await comparator.getAuthCredentials('https://protected.example.com');
* if (auth) {
* console.log(`Using credentials for ${auth.username}`);
* }
*/
async getAuthCredentials(domain, options = {}) {
// Check if credentials were provided via command line or environment
if (options.username && options.password) {
return { username: options.username, password: options.password };
}
// Check environment variables
const envUsername = process.env[`COMPARATOR_USER_${this.getDomainKey(domain)}`] || process.env.COMPARATOR_USERNAME;
const envPassword = process.env[`COMPARATOR_PASS_${this.getDomainKey(domain)}`] || process.env.COMPARATOR_PASSWORD;
if (envUsername && envPassword) {
console.log(`Using credentials from environment for ${domain}`);
return { username: envUsername, password: envPassword };
}
return new Promise((resolve) => {
console.log(`\nHTTP Authentication may be required for ${domain}`);
// Use readline-sync for username input
const username = readlineSync.question('Username (press enter to skip): ');
if (!username.trim()) {
resolve(null);
return;
}
// Use readline-sync's built-in password hiding - completely invisible keystrokes
const password = readlineSync.question('Password: ', {
hideEchoBack: true, // This completely hides all keystrokes
mask: '' // No mask character shown
});
resolve({ username: username.trim(), password: password });
});
}
/**
* Helper to create environment variable key from domain
*
* This method converts a domain URL into a valid environment variable key
* by extracting the hostname and replacing non-alphanumeric characters with underscores.
* This is used for domain-specific authentication environment variables.
*
* @param {string} domain - Domain URL to convert
* @returns {string} Environment variable key for the domain
* @example
* const key = comparator.getDomainKey('https://staging.example.com');
* // Returns: 'STAGING_EXAMPLE_COM'
*/
getDomainKey(domain) {
return new URL(domain).hostname.replace(/[^a-zA-Z0-9]/g, '_').toUpperCase();
}
/**
* Creates a new browser page with optional authentication.
*
* This method creates a new page instance with consistent configuration:
* - Sets user agent and viewport dimensions
* - Applies HTTP Basic Authentication if credentials are provided
* - Sets up response handlers for authentication challenges
*
* @async
* @param {Object} [auth=null] - Authentication credentials object
* @param {string} auth.username - Username for HTTP Basic Auth
* @param {string} auth.password - Password for HTTP Basic Auth
* @returns {Promise<Page>} Configured Puppeteer page instance
* @example
* const page = await comparator.createPage({ username: 'user', password: 'pass' });
*/
async createPage(auth = null) {
const page = await this.browser.newPage();
await page.setUserAgent(this.options.userAgent);
await page.setViewport({ width: 1920, height: 1080 });
// Set basic auth if provided
if (auth) {
await page.authenticate(auth);
}
// Handle authentication challenges
page.on('response', async (response) => {
if (response.status() === 401 && auth) {
console.log(`Authentication challenge detected for ${response.url()}`);
}
});
return page;
}
/**
* Normalizes HTML content by removing spurious differences and standardizing formatting.
*
* This method performs several normalization steps to ensure fair comparison:
* - Removes ignored HTML elements (scripts, styles, etc.)
* - Strips ignored attributes (CSRF tokens, nonces, etc.)
* - Filters out ignored CSS classes
* - Normalizes URLs to account for domain differences
* - Removes HTML comments and normalizes whitespace
*
* @param {string} html - Raw HTML content to normalize
* @param {string} baseUrl - Original domain URL for URL normalization
* @param {string} targetUrl - Target domain URL for URL normalization
* @returns {string} Normalized HTML content ready for comparison
* @example
* const normalized = comparator.normalizeContent(html, 'https://site1.com', 'https://site2.com');
*/
normalizeContent(html, baseUrl, targetUrl) {
const $ = cheerio.load(html);
// Remove ignored elements
this.options.ignoreElements.forEach(selector => {
$(selector).remove();
});
// Remove ignored attributes
$('*').each((i, elem) => {
const $elem = $(elem);
this.options.ignoreAttributes.forEach(attr => {
$elem.removeAttr(attr);
});
// Remove ignored classes
const classes = $elem.attr('class');
if (classes) {
const filteredClasses = classes
.split(' ')
.filter(cls => !this.options.ignoreClasses.some(ignore => cls.includes(ignore)))
.join(' ');
if (filteredClasses) {
$elem.attr('class', filteredClasses);
} else {
$elem.removeAttr('class');
}
}
});
// Normalize URLs - replace baseUrl with targetUrl in links and sources
$('a[href], img[src], link[href], script[src]').each((i, elem) => {
const $elem = $(elem);
const attrName = $elem.attr('href') ? 'href' : 'src';
const url = $elem.attr(attrName);
if (url && url.includes(baseUrl)) {
$elem.attr(attrName, url.replace(baseUrl, targetUrl));
}
});
// Remove comments
$('*').contents().filter(function() {
return this.type === 'comment';
}).remove();
// Normalize whitespace
return $.html()
.replace(/\s+/g, ' ')
.replace(/>\s+</g, '><')
.trim();
}
/**
* Extracts structured content from HTML for detailed comparison.
*
* This method parses HTML and extracts key content elements:
* - Page title and headings
* - Paragraph text content
* - Link text and URLs
* - Image alt text and sources
* - Form structure and input fields
*
* @param {string} html - HTML content to extract from
* @returns {Object} Structured content object with extracted elements
* @returns {string} returns.title - Page title text
* @returns {string[]} returns.headings - Array of heading texts
* @returns {string[]} returns.paragraphs - Array of paragraph texts
* @returns {Object[]} returns.links - Array of link objects with text and href
* @returns {Object[]} returns.images - Array of image objects with alt and src
* @returns {Object[]} returns.forms - Array of form objects with action, method, and inputs
* @example
* const content = comparator.extractContent(html);
* console.log(`Page has ${content.headings.length} headings`);
*/
extractContent(html) {
const $ = cheerio.load(html);
return {
title: $('head > title').text().trim(),
headings: $('h1, h2, h3, h4, h5, h6').map((i, el) => $(el).text().trim()).get(),
paragraphs: $('p').map((i, el) => $(el).text().trim()).get().filter(p => p.length > 0),
links: $('a[href]').map((i, el) => ({
text: $(el).text().trim(),
href: $(el).attr('href')
})).get(),
images: $('img[src]').map((i, el) => ({
alt: $(el).attr('alt') || '',
src: $(el).attr('src')
})).get(),
forms: $('form').map((i, el) => ({
action: $(el).attr('action') || '',
method: $(el).attr('method') || 'GET',
inputs: $(el).find('input, textarea, select').map((j, input) => ({
name: $(input).attr('name') || '',
type: $(input).attr('type') || 'text'
})).get()
})).get()
};
}
/**
* Compares two pages and identifies differences in content and structure.
*
* This method performs comprehensive comparison of normalized content:
* - Compares page titles, headings, and paragraphs
* - Analyzes link structures and image content
* - Examines form structures and input fields
* - Uses intelligent matching to detect content reordering
* - Provides detailed difference analysis with snippets
*
* @param {string} url - URL path being compared
* @param {string} content1 - HTML content from first site
* @param {string} content2 - HTML content from second site
* @param {string} domain1 - Base domain of first site
* @param {string} domain2 - Base domain of second site
* @returns {Object} Comparison result object
* @returns {string} returns.url - URL path that was compared
* @returns {boolean} returns.hasDifferences - Whether any differences were found
* @returns {Array} returns.differences - Array of difference objects
* @returns {Object} returns.extracted1 - Extracted content from first site
* @returns {Object} returns.extracted2 - Extracted content from second site
* @example
* const comparison = comparator.comparePage('/about', html1, html2, 'https://site1.com', 'https://site2.com');
* if (comparison.hasDifferences) {
* console.log(`Found ${comparison.differences.length} differences`);
* }
*/
comparePage(url, content1, content2, domain1, domain2) {
const normalized1 = this.normalizeContent(content1, domain1, domain2);
const normalized2 = this.normalizeContent(content2, domain2, domain1);
const extracted1 = this.extractContent(normalized1);
const extracted2 = this.extractContent(normalized2);
const differences = [];
// Compare titles
if (extracted1.title !== extracted2.title) {
differences.push({
type: 'title',
site1: extracted1.title,
site2: extracted2.title,
snippet: this.getSnippet(extracted1.title, extracted2.title)
});
}
// Compare headings with detailed analysis
const headingDiff = this.compareHeadings(extracted1.headings, extracted2.headings);
if (headingDiff.hasDifferences) {
differences.push({
type: 'headings',
site1: extracted1.headings.length,
site2: extracted2.headings.length,
details: headingDiff.details,
snippets: headingDiff.snippets
});
}
// Compare paragraphs with content analysis
const paragraphDiff = this.compareParagraphs(extracted1.paragraphs, extracted2.paragraphs);
if (paragraphDiff.hasDifferences) {
differences.push({
type: 'paragraphs',
site1: extracted1.paragraphs.length,
site2: extracted2.paragraphs.length,
details: paragraphDiff.details,
snippets: paragraphDiff.snippets
});
}
// Compare links with detailed analysis
const linkDiff = this.compareLinks(extracted1.links, extracted2.links);
if (linkDiff.hasDifferences) {
differences.push({
type: 'links',
site1: extracted1.links.length,
site2: extracted2.links.length,
details: linkDiff.details,
snippets: linkDiff.snippets
});
}
// Compare images
const imageDiff = this.compareImages(extracted1.images, extracted2.images);
if (imageDiff.hasDifferences) {
differences.push({
type: 'images',
site1: extracted1.images.length,
site2: extracted2.images.length,
details: imageDiff.details,
snippets: imageDiff.snippets
});
}
// Compare forms
const formDiff = this.compareForms(extracted1.forms, extracted2.forms);
if (formDiff.hasDifferences) {
differences.push({
type: 'forms',
site1: extracted1.forms.length,
site2: extracted2.forms.length,
details: formDiff.details,
snippets: formDiff.snippets
});
}
return {
url,
hasDifferences: differences.length > 0,
differences,
extracted1,
extracted2
};
}
/**
* Helper method to create snippets for differences
*
* This method creates truncated text snippets for displaying differences
* in reports, ensuring that long text content is displayed in a readable format.
*
* @param {string} text1 - Text from first site
* @param {string} text2 - Text from second site
* @param {number} [maxLength=100] - Maximum length for each snippet
* @returns {Object} Object containing truncated snippets for both sites
* @returns {string} returns.site1 - Truncated text from first site
* @returns {string} returns.site2 - Truncated text from second site
* @example
* const snippet = comparator.getSnippet('Very long text content...', 'Different long content...', 50);
* console.log(snippet.site1); // "Very long text content..."
*/
getSnippet(text1, text2, maxLength = 100) {
const truncate = (text) => {
if (text.length <= maxLength) return text;
return text.substring(0, maxLength) + '...';
};
return {
site1: truncate(text1),
site2: truncate(text2)
};
}
/**
* Intelligent content difference detection that accounts for offsets
*
* This method performs sophisticated content comparison that can detect:
* - Content additions and deletions
* - Content reordering
* - Position-independent differences
*
* It uses the longest common subsequence algorithm to identify reordering
* and provides detailed analysis of content changes.
*
* @param {Array} array1 - First array of content items
* @param {Array} array2 - Second array of content items
* @param {string} [contentType='content'] - Type of content being compared
* @returns {Object} Detailed difference analysis
* @returns {Array} returns.differences - Array of difference objects
* @returns {Array} returns.additions - Array of addition objects
* @returns {Array} returns.deletions - Array of deletion objects
* @returns {number} returns.matches - Number of matching items
* @returns {boolean} returns.reordered - Whether content appears to be reordered
* @example
* const analysis = comparator.findContentDifferences(headings1, headings2, 'heading');
* if (analysis.reordered) {
* console.log('Content appears to be reordered');
* }
*/
findContentDifferences(array1, array2, contentType = 'content') {
const differences = [];
const additions = [];
const deletions = [];
// Normalize arrays for comparison
const normalized1 = array1.map(item => this.normalizeForComparison(item));
const normalized2 = array2.map(item => this.normalizeForComparison(item));
// Find the longest common subsequence to identify reordering
const lcs = this.findLongestCommonSubsequence(normalized1, normalized2);
// Create maps for efficient lookup
const map1 = new Map();
const map2 = new Map();
// Build maps with normalized content as key and original items as values
array1.forEach((item, index) => {
const key = normalized1[index];
if (!map1.has(key)) {
map1.set(key, []);
}
map1.get(key).push(item);
});
array2.forEach((item, index) => {
const key = normalized2[index];
if (!map2.has(key)) {
map2.set(key, []);
}
map2.get(key).push(item);
});
// Find items that exist in both arrays (matches)
const matches = new Set();
for (const [key] of map1) {
if (map2.has(key)) {
matches.add(key);
}
}
// Find additions (items only in array2)
for (const [key, items2] of map2) {
if (!matches.has(key)) {
additions.push({
type: 'addition',
contentType,
items: items2,
count: items2.length,
snippet: this.getSnippet('', items2[0], 150)
});
}
}
// Find deletions (items only in array1)
for (const [key, items1] of map1) {
if (!matches.has(key)) {
deletions.push({
type: 'deletion',
contentType,
items: items1,
count: items1.length,
snippet: this.getSnippet(items1[0], '', 150)
});
}
}
// Check for reordering (items that exist in both but in different positions)
if (lcs.length < Math.min(normalized1.length, normalized2.length)) {
const reorderedCount = Math.min(normalized1.length, normalized2.length) - lcs.length;
if (reorderedCount > 0) {
differences.push({
type: 'reordering',
contentType,
count: reorderedCount,
description: `${reorderedCount} ${contentType}s appear to be reordered`
});
}
}
return {
differences: differences,
additions: additions,
deletions: deletions,
matches: matches.size,
reordered: lcs.length < Math.min(normalized1.length, normalized2.length)
};
}
/**
* Find longest common subsequence to detect reordering
*
* This method implements the longest common subsequence (LCS) algorithm
* to identify how much content has been reordered between two arrays.
* It's used to distinguish between actual content changes and simple reordering.
*
* @param {Array} arr1 - First array
* @param {Array} arr2 - Second array
* @returns {Array} Longest common subsequence between the arrays
* @example
* const lcs = comparator.findLongestCommonSubsequence(array1, array2);
* const reorderedCount = Math.min(array1.length, array2.length) - lcs.length;
*/
findLongestCommonSubsequence(arr1, arr2) {
const m = arr1.length;
const n = arr2.length;
const dp = Array(m + 1).fill().map(() => Array(n + 1).fill(0));
// Build LCS matrix
for (let i = 1; i <= m; i++) {
for (let j = 1; j <= n; j++) {
if (arr1[i - 1] === arr2[j - 1]) {
dp[i][j] = dp[i - 1][j - 1] + 1;
} else {
dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
}
}
}
// Reconstruct the LCS
const lcs = [];
let i = m, j = n;
while (i > 0 && j > 0) {
if (arr1[i - 1] === arr2[j - 1]) {
lcs.unshift(arr1[i - 1]);
i--;
j--;
} else if (dp[i - 1][j] > dp[i][j - 1]) {
i--;
} else {
j--;
}
}
return lcs;
}
/**
* Normalize content for comparison (remove extra whitespace, etc.)
*
* This method standardizes text content for fair comparison by:
* - Converting to lowercase
* - Trimming whitespace
* - Normalizing multiple spaces to single spaces
*
* @param {*} text - Text content to normalize (converted to string if not already)
* @returns {string} Normalized text content
* @example
* const normalized = comparator.normalizeForComparison(' Hello World ');
* // Returns: "hello world"
*/
normalizeForComparison(text) {
if (typeof text !== 'string') {
text = String(text);
}
return text
.trim()
.replace(/\s+/g, ' ')
.toLowerCase();
}
/**
* Compare headings with detailed analysis
*
* This method compares heading structures between two sites and provides
* detailed analysis of differences including additions, deletions, and reordering.
*
* @param {string[]} headings1 - Headings from first site
* @param {string[]} headings2 - Headings from second site
* @returns {Object} Heading comparison result
* @returns {boolean} returns.hasDifferences - Whether any differences were found
* @returns {string[]} returns.details - Array of difference descriptions
* @returns {Array} returns.snippets - Array of difference snippets (limited to 5)
* @example
* const result = comparator.compareHeadings(site1Headings, site2Headings);
* if (result.hasDifferences) {
* console.log('Heading differences:', result.details.join(', '));
* }
*/
compareHeadings(headings1, headings2) {
const details = [];
const snippets = [];
if (headings1.length !== headings2.length) {
details.push(`Different number of headings: ${headings1.length} vs ${headings2.length}`);
}
// Use intelligent matching to find actual differences, not just position shifts
const { differences, additions, deletions, reordered } = this.findContentDifferences(headings1, headings2, 'heading');
if (differences.length > 0) {
details.push(`${differences.length} headings have different content`);
snippets.push(...differences.slice(0, 3));
}
if (additions.length > 0) {
details.push(`${additions.length} headings added`);
snippets.push(...additions.slice(0, 2));
}
if (deletions.length > 0) {
details.push(`${deletions.length} headings removed`);
snippets.push(...deletions.slice(0, 2));
}
if (reordered) {
details.push(`Headings appear to be reordered`);
}
return {
hasDifferences: details.length > 0,
details,
snippets: snippets.slice(0, 5) // Limit to first 5 differences
};
}
/**
* Compare paragraphs with content analysis
*
* This method compares paragraph content between two sites and provides
* detailed analysis of differences including additions, deletions, and reordering.
*
* @param {string[]} paragraphs1 - Paragraphs from first site
* @param {string[]} paragraphs2 - Paragraphs from second site
* @returns {Object} Paragraph comparison result
* @returns {boolean} returns.hasDifferences - Whether any differences were found
* @returns {string[]} returns.details - Array of difference descriptions
* @returns {Array} returns.snippets - Array of difference snippets (limited to 3)
* @example
* const result = comparator.compareParagraphs(site1Paragraphs, site2Paragraphs);
* if (result.hasDifferences) {
* console.log('Paragraph differences:', result.details.join(', '));
* }
*/
compareParagraphs(paragraphs1, paragraphs2) {
const details = [];
const snippets = [];
if (paragraphs1.length !== paragraphs2.length) {
details.push(`Different number of paragraphs: ${paragraphs1.length} vs ${paragraphs2.length}`);
}
// Use intelligent matching to find actual differences, not just position shifts
const { differences, additions, deletions, reordered } = this.findContentDifferences(paragraphs1, paragraphs2, 'paragraph');
if (differences.length > 0) {
details.push(`${differences.length} paragraphs have different content`);
snippets.push(...differences.slice(0, 3));
}
if (additions.length > 0) {
details.push(`${additions.length} paragraphs added`);
snippets.push(...additions.slice(0, 2));
}
if (deletions.length > 0) {
details.push(`${deletions.length} paragraphs removed`);
snippets.push(...deletions.slice(0, 2));
}
if (reordered) {
details.push(`Paragraphs appear to be reordered`);
}
return {
hasDifferences: details.length > 0,
details,
snippets: snippets.slice(0, 3) // Limit to first 3 differences
};
}
/**
* Compare links with detailed analysis
*
* This method compares link structures between two sites and provides
* detailed analysis of differences including additions, deletions, and reordering.
*
* @param {Object[]} links1 - Links from first site
* @param {Object[]} links2 - Links from second site
* @returns {Object} Link comparison result
* @returns {boolean} returns.hasDifferences - Whether any differences were found
* @returns {string[]} returns.details - Array of difference descriptions
* @returns {Array} returns.snippets - Array of difference snippets
* @example
* const result = comparator.compareLinks(site1Links, site2Links);
* if (result.hasDifferences) {
* console.log('Link differences:', result.details.join(', '));
* }
*/
compareLinks(links1, links2) {
const details = [];
const snippets = [];
if (links1.length !== links2.length) {
details.push(`Different number of links: ${links1.length} vs ${links2.length}`);
}
// Extract link texts for comparison
const texts1 = links1.map(l => l.text).filter(t => t.length > 0);
const texts2 = links2.map(l => l.text).filter(t => t.length > 0);
// Use intelligent matching to find actual differences
const { differences, additions, deletions, reordered } = this.findContentDifferences(texts1, texts2, 'link');
if (differences.length > 0) {
details.push(`${differences.length} links have different text`);
snippets.push(...differences.slice(0, 3));
}
if (additions.length > 0) {
details.push(`${additions.length} links added`);
snippets.push(...additions.slice(0, 2));
}
if (deletions.length > 0) {
details.push(`${deletions.length} links removed`);
snippets.push(...deletions.slice(0, 2));
}
if (reordered) {
details.push(`Links appear to be reordered`);
}
return {
hasDifferences: details.length > 0,
details,
snippets
};
}
/**
* Compare images with detailed analysis
*
* This method compares image content between two sites and provides
* detailed analysis of differences including counts, alt text, and sources.
*
* @param {Object[]} images1 - Images from first site
* @param {Object[]} images2 - Images from second site
* @returns {Object} Image comparison result
* @returns {boolean} returns.hasDifferences - Whether any differences were found
* @returns {string[]} returns.details - Array of difference descriptions
* @returns {Array} returns.snippets - Array of difference snippets
* @example
* const result = comparator.compareImages(site1Images, site2Images);
* if (result.hasDifferences) {
* console.log('Image differences:', result.details.join(', '));
* }
*/
compareImages(images1, images2) {
const details = [];
const snippets = [];
if (images1.length !== images2.length) {
details.push(`Different number of images: ${images1.length} vs ${images2.length}`);
}
// Find images with missing alt text
const missingAlt1 = images1.filter(img => !img.alt || img.alt.trim() === '').length;
const missingAlt2 = images2.filter(img => !img.alt || img.alt.trim() === '').length;
if (missingAlt1 !== missingAlt2) {
details.push(`Different number of images without alt text: ${missingAlt1} vs ${missingAlt2}`);
}
// Find different image sources
const srcs1 = images1.map(img => img.src).filter(src => src);
const srcs2 = images2.map(img => img.src).filter(src => src);
const uniqueSrcs1 = srcs1.filter(src => !srcs2.includes(src));
const uniqueSrcs2 = srcs2.filter(src => !srcs1.includes(src));
if (uniqueSrcs1.length > 0 || uniqueSrcs2.length > 0) {
details.push(`Different image sources found`);
if (uniqueSrcs1.length > 0) {
snippets.push({
type: 'images_only_in_site1',
count: uniqueSrcs1.length,
examples: uniqueSrcs1.slice(0, 2)
});
}
if (uniqueSrcs2.length > 0) {
snippets.push({
type: 'images_only_in_site2',
count: uniqueSrcs2.length,
examples: uniqueSrcs2.slice(0, 2)
});
}
}
return {
hasDifferences: details.length > 0,
details,
snippets
};
}
/**
* Compare forms with detailed analysis
*
* This method compares form structures between two sites and provides
* detailed analysis of differences including actions, methods, and input fields.
*
* @param {Object[]} forms1 - Forms from first site
* @param {Object[]} forms2 - Forms from second site
* @returns {Object} Form comparison result
* @returns {boolean} returns.hasDifferences - Whether any differences were found
* @returns {string[]} returns.details - Array of difference descriptions
* @returns {Array} returns.snippets - Array of difference snippets
* @example
* const result = comparator.compareForms(site1Forms, site2Forms);
* if (result.hasDifferences) {
* console.log('Form differences:', result.details.join(', '));
* }
*/
compareForms(forms1, forms2) {
const details = [];
const snippets = [];
if (forms1.length !== forms2.length) {
details.push(`Different number of forms: ${forms1.length} vs ${forms2.length}`);
}
// Compare form actions and methods
const actions1 = forms1.map(f => f.action).filter(a => a);
const actions2 = forms2.map(f => f.action).filter(a => a);
if (actions1.length !== actions2.length) {
details.push(`Different number of form actions: ${actions1.length} vs ${actions2.length}`);
}
// Compare input counts
const totalInputs1 = forms1.reduce((sum, form) => sum + form.inputs.length, 0);
const totalInputs2 = forms2.reduce((sum, form) => sum + form.inputs.length, 0);
if (totalInputs1 !== totalInputs2) {
details.push(`Different total input fields: ${totalInputs1} vs ${totalInputs2}`);
}
return {
hasDifferences: details.length > 0,
details,
snippets
};
}
/**
* Generate summary of difference types found
*
* This method analyzes all differences found during comparison and provides
* a count of each type of difference (titles, headings, paragraphs, etc.).
*
* @returns {Object} Object with counts for each difference type
* @example
* const summary = comparator.getDifferenceTypeSummary();
* console.log(`Found ${summary.title || 0} title differences`);
* console.log(`Found ${summary.headings || 0} heading differences`);
*/
getDifferenceTypeSummary() {
const typeCounts = {};
this.results.differences.forEach(diff => {
diff.differences.forEach(d => {
typeCounts[d.type] = (typeCounts[d.type] || 0) + 1;
});
});
return typeCounts;
}
/**
* Get the most significant differences for quick overview
*
* This method identifies the most important differences found during comparison
* based on criteria like title changes, content with snippets, and large count differences.
* It's useful for providing a quick summary of the most impactful changes.
*
* @returns {Array} Array of significant difference objects (limited to top 5)
* @example
* const significant = comparator.getSignificantDifferences();
* console.log(`Found ${significant.length} significant differences`);
*/
getSignificantDifferences() {
const significant = [];
this.results.differences.forEach(diff => {
const significantDiffs = diff.differences.filter(d => {
// Consider title differences as most significant
if (d.type === 'title') return true;
// Consider content differences with snippets as significant
if (d.snippets && d.snippets.length > 0) return true;
// Consider large count differences as significant
if (Math.abs(d.site1 - d.site2) > 2) return true;
return false;
});
if (significantDiffs.length > 0) {
significant.push({
url: diff.url,
differences: significantDiffs
});
}
});
return significant.slice(0, 5); // Return top 5 most significant
}
/**
* Get a summary of offset analysis results
*
* This method provides a comprehensive summary of content changes including:
* - Pages with additions, deletions, and reordering
* - Total counts of content changes
* - Breakdown by content type
*
* @returns {Object} Summary of content change analysis
* @returns {number} returns.totalPages - Total pages analyzed
* @returns {number} returns.pagesWithAdditions - Pages with content additions
* @returns {number} returns.pagesWithDeletions - Pages with content deletions
* @returns {number} returns.pagesWithReordering - Pages with content reordering
* @returns {number} returns.totalAdditions - Total content items added
* @returns {number} returns.totalDeletions - Total content items removed
* @returns {Object} returns.contentTypes - Breakdown by content type
* @example
* const analysis = comparator.getOffsetAnalysisSummary();
* console.log(`${analysis.totalAdditions} items were added across all pages`);
*/
getOffsetAnalysisSummary() {
const summary = {
totalPages: this.results.differences.length,
pagesWithAdditions: 0,
pagesWithDeletions: 0,
pagesWithReordering: 0,
totalAdditions: 0,
totalDeletions: 0,
contentTypes: {}
};
this.results.differences.forEach(diff => {
let hasAdditions = false;
let hasDeletions = false;
let hasReordering = false;
diff.differences.forEach(d => {
if (d.snippets) {
d.snippets.forEach(snippet => {
if (snippet.type === 'addition') {
hasAdditions = true;
summary.totalAdditions += snippet.count;
summary.contentTypes[snippet.contentType] = (summary.contentTypes[snippet.contentType] || 0) + snippet.count;
} else if (snippet.type === 'deletion') {
hasDeletions = true;
summary.totalDeletions += snippet.count;
summary.contentTypes[snippet.contentType] = (summary.contentTypes[snippet.contentType] || 0) + snippet.count;
} else if (snippet.type === 'reordering') {
hasReordering = true;
}
});
}
});
if (hasAdditions) summary.pagesWithAdditions++;
if (hasDeletions) summary.pagesWithDeletions++;
if (hasReordering) summary.pagesWithReordering++;
});
return summary;
}
/**
* Crawls a single page to extract content and discover links.
*
* This method handles the process of visiting a page:
* - Navigates to the specified URL with proper waiting
* - Extracts the page content and status
* - Discovers internal links for further crawling
* - Handles errors gracefully and provides detailed error information
*
* @async
* @param {Page} page - Puppeteer page instance to use for crawling
* @param {string} url - URL to crawl
* @param {Object} [auth=null] - Authentication credentials if needed
* @returns {Promise<Object>} Crawl result object
* @returns {string|null} returns.content - HTML content of the page, or null if failed
* @returns {string[]} returns.links - Array of discovered internal links
* @returns {number} returns.status - HTTP status code of the response
* @returns {string} [returns.error] - Error message if crawling failed
* @example
* const result = await comparator.crawlPage(page, 'https://example.com/page');
* if (result.content) {
* console.log(`Found ${result.links.length} links on page`);
* }
*/
async crawlPage(page, url, auth = null) {
try {
console.log(`Crawling: ${url}`);
const response = await page.goto(url, {
waitUntil: 'networkidle2',
timeout: this.options.timeout
});
if (!response.ok()) {
throw new Error(`HTTP ${response.status()}: ${response.statusText()}`);
}
// Wait for dynamic content - compatible with all Puppeteer versions
await this.waitFor(page, this.options.delay);
const content = await page.content();
// Get links with error handling
let links = [];
try {
links = await page.$$eval('a[href]', anchors =>
anchors.map(a => a.href).filter(href => href && href.startsWith('http'))
);
console.log(` Found ${links.length} links on ${url}`);
} catch (linkError) {
console.warn(`Could not extract links from ${url}:`, linkError.message);
}
return { content, links, status: response.status() };
} catch (error) {
console.error(`Error crawling ${url}:`, error.message);
return { content: null, links: [], error: error.message };
}
}
/**
* Discovers pages on a domain by crawling and following internal links.
*
* This method implements intelligent page discovery:
* - Starts from the root domain and follows internal links
* - Respects maximum page and discovery limits
* - Avoids duplicate visits and external links
* - Provides progress updates during discovery
* - Handles authentication challenges automatically
*
* @async
* @param {string} domain - Domain to discover pages from
* @param {Object} [auth=null] - Authentication credentials if needed
* @returns {Promise<Map>} Map of discovered pages with their content and links
* @example
* const pages = await comparator.discoverPages('https://example.com');
* console.log(`Discovered ${pages.size} pages`);
*/
async discoverPages(domain, auth = null) {
const page = await this.createPage(auth);
const discovered = new Set([domain]);
const toVisit = [domain];
const pages = new Map();
console.log(`Discovering pages from ${domain}...`);
while (toVisit.length > 0 && pages.size < this.options.maxPages) {
const url = toVisit.shift();
if (this.visited.has(url)) continue;
this.visited.add(url);
const result = await this.crawlPage(page, url, auth);
pages.set(url, result);
if (result.content && result.links && result.links.length > 0) {
// Add internal links to discovery queue
result.links.forEach(link => {
try {
const linkUrl = new URL(link);
const baseUrl = new URL(domain);
// Only add internal links that we haven't discovered yet
if (linkUrl.hostname === baseUrl.hostname &&
!discovered.has(link) &&
!link.includes('#') && // Skip anchor links
!link.includes('?') && // Skip query parameters for now
pages.size < this.options.maxPages &&
discovered.size < this.options.maxDiscovery) {
discovered.add(link);
toVisit.push(link);
console.log(` Adding to queue: ${link}`);
}
} catch (e) {
// Invalid URL, skip silently
}
});
}
// Show progress
if (pages.size % 5 === 0) {
console.log(` Crawled ${pages.size} pages from ${domain}`);
}
}
await page.close();
console.log(`✅ Finished compairing ${pages.size} pages from ${domain}`);
return pages;
}
/**
* Main comparison function that orchestrates the entire comparison process.
*
* This is the primary method for comparing two websites:
* - Initializes the comparator and browser
* - Handles authentication for both sites
* - Discovers pages on both sites
* - Compares common pages for differences
* - Generates comprehensive reports
* - Saves results in multiple formats
*
* @async
* @param {string} domain1 - First domain to compare
* @param {string} domain2 - Second domain to compare
* @param {Object} [authOptions={}] - Authentication options for both sites
* @param {Object} [authOptions.site1] - Authentication for first site
* @param {Object} [authOptions.site2] - Authentication for second site
* @throws {Error} If comparison fails or authentication is invalid
* @example
* try {
* await comparator.compare('https://staging.example.com', 'https://example.com', {
* site1: { username: 'user1', password: 'pass1' },
* site2: { username: 'user2', password: 'pass2' }
* });
* console.log('Comparison completed successfully');
* } catch (error) {
* console.error('Comparison failed:', error.message);
* }
*/
async compare(domain1, domain2,