UNPKG

web-page-analyzer-cli

Version:

一个强大的网站链接抓取工具,支持深度抓取、认证和页面分析

826 lines (699 loc) 27.2 kB
#!/usr/bin/env node
// web-page-analyzer (trimmed edition)
//
// Loads each target page in Chromium via Playwright, collects console logs,
// page errors and failed network requests, scans page text / logs for
// configurable keywords, and writes a JSON health report per run.
//
// Two modes:
//   - crawl mode (default): start from the base URL and follow same-host links
//     up to --depth levels;
//   - direct-analysis mode: analyze the base URL plus every --main_page_url.
const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');

// Global configuration defaults.
const CONFIG = {
  // Default keywords searched in browser console logs (used when none are
  // supplied via --log_keyword).
  defaultLogKeywords: [
    'error', 'exception', 'failed', 'runtime', 'application error', 'crash',
    'timeout', 'connection refused', 'not found', 'unauthorized', 'forbidden',
    'internal server error', 'bad request', 'service unavailable'
  ],
  // Default keywords searched in page body text (used when none are supplied
  // via --page_keyword).
  defaultPageKeywords: [
    'not found', '404', 'page not found', 'error', 'exception',
    'runtime error', 'application error', 'client-side exception',
    'failed to load', 'loading error', 'network error'
  ],
  // Default crawl depth.
  defaultDepth: 1,
  // Default number of concurrently open pages.
  defaultConcurrency: 8,
  // Navigation / default page-action timeout (ms).
  pageTimeout: 15000,
  // networkidle wait timeout (ms).
  loadTimeout: 7000,
  // Extra settle delay (ms) after load before scraping the page.
  // FIX: this was previously a hard-coded inline 10 s sleep buried in
  // performHealthCheck; hoisted here so it is configurable in one place.
  postLoadDelay: 10000
};

// Initialization hook (kept for interface compatibility; currently only logs).
function initializeConfig() {
  console.log('配置初始化完成');
}

// BrowserManager - owns the single Chromium instance, applies auth state
// (headers / cookies / localStorage) to every new page, and throttles the
// number of simultaneously open pages.
class BrowserManager {
  /**
   * @param {number} maxConcurrency  max simultaneously open pages
   * @param {object} options         expects headless, proxy, authOptions, targetUrl
   */
  constructor(maxConcurrency = CONFIG.defaultConcurrency, options = {}) {
    this.maxConcurrency = maxConcurrency;
    this.browser = null;
    this.activePages = 0;
    this.queue = [];
    this.options = options; // holds headless, proxy, authOptions, targetUrl, ...
    this.isShutdown = false;
  }

  // Lazily launch (and cache) the browser; no-op after shutdown().
  async getBrowser() {
    if (!this.browser && !this.isShutdown) {
      const browserOptions = { headless: this.options.headless !== false };
      if (this.options.proxy) {
        browserOptions.proxy = { server: this.options.proxy };
      }
      this.browser = await chromium.launch(browserOptions);
    }
    return this.browser;
  }

  // Create a page with auth state baked in at creation time, so headers,
  // cookies and localStorage are already present on first navigation.
  async createPage() {
    const browser = await this.getBrowser();
    if (!browser) {
      throw new Error('浏览器未初始化');
    }

    const pageOptions = {};
    // Pull auth options defensively; any of the three pieces may be absent.
    const authOpts = this.options.authOptions || {};
    const { headers = {}, cookies = [], localstorage = [] } = authOpts;

    // targetUrl is required to scope cookies / localStorage correctly.
    if (!this.options.targetUrl) {
      throw new Error("BrowserManager 需要 targetUrl 来设置认证信息。");
    }
    const targetOrigin = new URL(this.options.targetUrl).origin;
    const targetDomain = new URL(this.options.targetUrl).hostname;

    // 1. Extra HTTP headers.
    if (Object.keys(headers).length > 0) {
      pageOptions.extraHTTPHeaders = headers;
    }

    const storageState = { cookies: [], origins: [] };

    // 2. Cookies - a cookie may carry its own domain, else the target's is used.
    if (cookies.length > 0) {
      storageState.cookies = cookies.map(c => ({
        ...c,
        domain: c.domain || targetDomain,
        path: c.path || '/'
      }));
    }

    // 3. localStorage entries, scoped to the target origin.
    if (localstorage.length > 0) {
      storageState.origins.push({
        origin: targetOrigin,
        localStorage: localstorage
      });
    }

    if (storageState.cookies.length > 0 || storageState.origins.length > 0) {
      pageOptions.storageState = storageState;
    }

    // FIX: claim the concurrency slot, but release it again if page creation
    // fails - previously a failed newPage() leaked the slot and could make
    // waitForSlot() spin forever.
    this.activePages++;
    try {
      const page = await browser.newPage(pageOptions);
      page.setDefaultTimeout(CONFIG.pageTimeout);
      return page;
    } catch (error) {
      this.activePages = Math.max(0, this.activePages - 1);
      throw error;
    }
  }

  // Close a page and free its concurrency slot.
  async closePage(page) {
    if (page && !page.isClosed()) {
      await page.close();
    }
    this.activePages = Math.max(0, this.activePages - 1);
  }

  // Close the browser; further getBrowser() calls return null.
  async shutdown() {
    this.isShutdown = true;
    if (this.browser) {
      try {
        await this.browser.close();
      } catch (error) {
        console.error('关闭浏览器时出错:', error.message);
      }
      this.browser = null;
    }
  }

  // Busy-wait (100 ms poll) until a concurrency slot is free.
  async waitForSlot() {
    while (this.activePages >= this.maxConcurrency) {
      await new Promise(resolve => setTimeout(resolve, 100));
    }
  }
}

// Print usage and exit early when help is requested.
if (process.argv.includes('--help') || process.argv.includes('-h')) {
  console.log(`
Web Page Analyzer - 网站页面分析工具 (精简版)

使用方法:
  web-page-analyzer <URL> [选项]

参数:
  URL                         要分析的目标网站URL。在所有模式下都必须提供。

选项:
  --main_page_url=URL         额外指定要分析的URL路径。可多次使用。
                              如果使用此选项,脚本会进入“直接分析”模式。
  --page_keyword=KEYWORD      在页面内容中搜索的关键字。可多次使用。
  --log_keyword=KEYWORD       在浏览器日志中搜索的关键字。可多次使用。
  --depth=N                   分析深度 (仅在爬取模式下生效),默认为1
  --ui                        启用浏览器UI界面(非无头模式)
  --sequential                使用顺序模式(禁用并发)
  --concurrency=N             设置并发数量,默认为8
  --proxy=server              设置代理服务器(如:http://127.0.0.1:10809)
  --output-dir=dir            指定输出目录,默认为./output
  --cookie="name=value"       设置Cookie。可多次使用。
  --header="name:value"       设置HTTP头。可多次使用。
  --localstorage="key=value"  在页面加载前设置LocalStorage。可多次使用。
  --token=your-token          设置Bearer Token (等同于 --header="Authorization: Bearer your-token")
  --auth=username:password    设置Basic Auth (等同于 --header="Authorization: Basic ...")
  --skip_url_keyword=keyword  跳过路径中包含关键词的URL(可多次使用)

模式说明:
  - 爬取模式 (默认): 仅提供一个URL,脚本会从该URL开始爬取。
    示例: web-page-analyzer "https://example.com" --depth=1

  - 直接分析模式: 提供一个URL,并使用一个或多个 --main_page_url。
    脚本将分析主URL和所有指定的额外URL。
    示例: web-page-analyzer https://example.com/dashboard --main_page_url=/login
    (这将分析 /dashboard 和 /login 两个页面)
`);
  process.exit(0);
}

// Parse process.argv into the run configuration. Exits the process with an
// error message when the base URL is missing or malformed.
function parseArguments() {
  const args = process.argv.slice(2);
  const options = {
    targetUrl: null,
    maxDepth: CONFIG.defaultDepth,
    headless: true,
    authOptions: {
      headers: {},
      cookies: [],
      localstorage: [],
    },
    proxy: null,
    concurrent: true,
    maxConcurrency: CONFIG.defaultConcurrency,
    outputDir: './output',
    skipUrlKeywords: [],
    mainPageUrls: [],
    pageKeywords: [],
    logKeywords: [],
  };

  let urlProvided = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];

    // First bare (non-flag) argument is the base URL; extras are ignored.
    if (!arg.startsWith('--') && !arg.startsWith('-')) {
      if (!urlProvided) {
        options.targetUrl = arg;
        urlProvided = true;
      }
      continue;
    }

    if (arg.startsWith('--depth=')) {
      options.maxDepth = parseInt(arg.substring(8), 10) || CONFIG.defaultDepth;
    } else if (arg === '--ui') {
      options.headless = false;
    } else if (arg === '--sequential') {
      options.concurrent = false;
    } else if (arg.startsWith('--concurrency=')) {
      options.maxConcurrency = parseInt(arg.substring(14), 10) || CONFIG.defaultConcurrency;
    } else if (arg.startsWith('--cookie=')) {
      // Split only on the first '=' so cookie values may contain '='.
      const cookieStr = arg.substring(9);
      const [name, ...valueParts] = cookieStr.split('=');
      const value = valueParts.join('=');
      if (name && value) {
        options.authOptions.cookies.push({ name: name.trim(), value: value.trim() });
      }
    } else if (arg.startsWith('--header=')) {
      // Split only on the first ':' so header values may contain ':'.
      const headerStr = arg.substring(9);
      const [name, ...valueParts] = headerStr.split(':');
      const value = valueParts.join(':');
      if (name && value) {
        // Merge headers instead of replacing the map.
        options.authOptions.headers[name.trim()] = value.trim();
      }
    } else if (arg.startsWith('--localstorage=')) {
      const lsStr = arg.substring(15);
      const [key, ...valueParts] = lsStr.split('=');
      const value = valueParts.join('=');
      if (key && value) {
        options.authOptions.localstorage.push({ name: key.trim(), value: value.trim() });
      }
    } else if (arg.startsWith('--token=')) {
      options.authOptions.headers['Authorization'] = `Bearer ${arg.substring(8)}`;
    } else if (arg.startsWith('--auth=')) {
      const [username, password] = arg.substring(7).split(':');
      if (username && password) {
        options.authOptions.headers['Authorization'] =
          `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
      }
    } else if (arg.startsWith('--proxy=')) {
      options.proxy = arg.substring(8);
    } else if (arg.startsWith('--output-dir=')) {
      options.outputDir = arg.substring(13);
    } else if (arg.startsWith('--skip_url_keyword=')) {
      options.skipUrlKeywords.push(arg.substring(19));
    } else if (arg.startsWith('--main_page_url=')) {
      options.mainPageUrls.push(arg.substring(16));
    } else if (arg.startsWith('--page_keyword=')) {
      options.pageKeywords.push(arg.substring(15));
    } else if (arg.startsWith('--log_keyword=')) {
      options.logKeywords.push(arg.substring(14));
    }
  }

  if (!options.targetUrl) {
    console.error('错误: 必须提供一个基础URL。');
    console.error('示例 (爬取模式): web-page-analyzer https://example.com');
    console.error('示例 (直接分析): web-page-analyzer https://example.com --main_page_url=/login');
    process.exit(1);
  }

  try {
    new URL(options.targetUrl);
  } catch (error) {
    console.error(`错误: 提供的基础URL格式无效: ${options.targetUrl}`);
    process.exit(1);
  }

  // Fall back to built-in keyword lists when none were supplied.
  if (options.pageKeywords.length === 0) {
    options.pageKeywords = CONFIG.defaultPageKeywords;
    console.log(`未提供页面关键字,使用默认值: ${options.pageKeywords.join(', ')}`);
  }
  if (options.logKeywords.length === 0) {
    options.logKeywords = CONFIG.defaultLogKeywords;
    console.log(`未提供日志关键字,使用默认值: ${options.logKeywords.join(', ')}`);
  }

  return options;
}

const config = parseArguments();
const {
  targetUrl, maxDepth, headless, authOptions, proxy, outputDir,
  concurrent, maxConcurrency, skipUrlKeywords, mainPageUrls,
  pageKeywords, logKeywords
} = config;

// Shared run state.
const visitedUrls = new Set();
const allLinks = [];
const healthChecks = [];

// HealthChecker - per-page collector of console messages, page errors and
// failed requests; turns them into one health record.
class HealthChecker {
  constructor() {
    this.consoleMessages = [];
    this.networkErrors = [];
    this.logCounts = new Map(); // dedup counter keyed by rendered log line
    this.errorLogs = [];        // unique error-level log lines only
  }

  // Attach console / pageerror / requestfailed listeners; resets all state so
  // a checker could be reused across pages.
  async setupPageListeners(page) {
    this.consoleMessages = [];
    this.networkErrors = [];
    this.logCounts.clear();
    this.errorLogs = [];

    page.on('console', msg => {
      const logText = msg.text();
      // React DevTools advert is noise, not a page problem.
      if (logText.includes('Download the React DevTools')) return;
      const log = {
        type: msg.type(),
        text: logText,
        toString: () => `[${msg.type().toUpperCase()}] ${logText}`
      };
      const logKey = log.toString();
      if (this.logCounts.has(logKey)) {
        this.logCounts.set(logKey, this.logCounts.get(logKey) + 1);
      } else {
        this.logCounts.set(logKey, 1);
        this.consoleMessages.push(log);
      }
      if (msg.type() === 'error') {
        if (!this.errorLogs.includes(logKey)) {
          this.errorLogs.push(logKey);
        }
      }
    });

    page.on('pageerror', error => {
      const errorLog = `[PAGE_ERROR] ${error.message}`;
      if (this.logCounts.has(errorLog)) {
        this.logCounts.set(errorLog, this.logCounts.get(errorLog) + 1);
      } else {
        this.logCounts.set(errorLog, 1);
        this.errorLogs.push(errorLog);
        this.consoleMessages.push({
          type: 'error',
          text: error.message,
          toString: () => errorLog
        });
      }
    });

    page.on('requestfailed', request => {
      const errorLog = `[NETWORK_ERROR] ${request.method()} ${request.url()} - ${request.failure()?.errorText || 'Unknown error'}`;
      if (this.logCounts.has(errorLog)) {
        this.logCounts.set(errorLog, this.logCounts.get(errorLog) + 1);
      } else {
        this.logCounts.set(errorLog, 1);
        this.networkErrors.push({
          url: request.url(),
          failure: request.failure()?.errorText || 'Unknown error',
          toString: () => errorLog
        });
        this.errorLogs.push(errorLog);
        this.consoleMessages.push({
          type: 'error',
          text: `${request.method()} ${request.url()} - ${request.failure()?.errorText || 'Unknown error'}`,
          toString: () => errorLog
        });
      }
    });
  }

  /**
   * Build the health record for an analyzed page.
   *
   * @param {number|null} navigationStatus - HTTP status of the main-document
   *   response (from page.goto()). FIX: the previous implementation called
   *   page.waitForResponse() AFTER navigation had already finished, so the
   *   wait always timed out (burning 5 s per page) and every page was
   *   reported as status 200. The caller now passes the real status; the old
   *   fallback path is kept only for callers that pass nothing.
   */
  async collectHealthCheckData(page, pageUrl, currentDepth, linksFound, linksFiltered, navigationStatus = null) {
    const pageTitle = await page.title();
    const finalUrl = page.url();

    // Keyword scan over the rendered body text (case-insensitive).
    const { matchedPageKeywords, pageText } = await page.evaluate((keywords) => {
      const text = document.body ? document.body.innerText : '';
      const lowerCaseText = text.toLowerCase();
      const matches = keywords.filter(keyword => lowerCaseText.includes(keyword.toLowerCase()));
      return { matchedPageKeywords: matches, pageText: text };
    }, pageKeywords);

    // Keyword scan over collected console logs.
    const matchedLogIssues = [];
    const consoleLogText = this.consoleMessages.map(m => m.text.toLowerCase()).join('\n');
    logKeywords.forEach(keyword => {
      if (consoleLogText.includes(keyword.toLowerCase())) {
        matchedLogIssues.push(`日志包含关键字: ${keyword}`);
      }
    });

    let statusCode = navigationStatus;
    if (statusCode == null) {
      // Legacy fallback (unreliable - see note above): default to 200.
      statusCode = 200;
      try {
        const response = await page
          .waitForResponse(res => res.url() === finalUrl, { timeout: 5000 })
          .catch(() => null);
        if (response) {
          statusCode = response.status();
        }
      } catch (e) {
        // ignore - keep the default status
      }
    }

    const issues = [];
    if (matchedPageKeywords.length > 0) {
      issues.push(`页面内容包含关键字: ${matchedPageKeywords.join(', ')}`);
    }
    issues.push(...matchedLogIssues);
    if (this.networkErrors.length > 0) {
      issues.push(`存在网络错误(${this.networkErrors.length}条)`);
    }
    if (statusCode >= 400) {
      issues.push(`HTTP状态码异常: ${statusCode}`);
    }

    // Render unique logs, annotating duplicates with their count.
    const processedConsoleMessages = this.consoleMessages.map(log => {
      const count = this.logCounts.get(log.toString());
      return count > 1 ? `${log.toString()} (重复 ${count} 次)` : log.toString();
    });

    return {
      url: pageUrl,
      title: pageTitle,
      depth: currentDepth,
      statusCode: statusCode,
      linksFound: linksFound,
      linksFiltered: linksFiltered,
      pageText: pageText,
      consoleMessages: processedConsoleMessages,
      networkErrors: this.networkErrors.map(e => e.toString()),
      finalUrl: finalUrl,
      timestamp: new Date().toISOString(),
      health: {
        status: issues.length > 0 ? 'WARNING' : 'HEALTHY',
        issues: issues
      },
      logStats: {
        uniqueLogs: this.consoleMessages.length,
        totalLogs: Array.from(this.logCounts.values()).reduce((sum, count) => sum + count, 0),
        uniqueErrors: this.errorLogs.length
      }
    };
  }

  // One-line progress summary for a finished page.
  outputHealthSummary(healthData) {
    const status = healthData.health.status;
    const issuesText = healthData.health.issues.length > 0
      ? `${healthData.health.issues.join('; ')}`
      : '无';
    let finalUrlText = '';
    if (healthData.finalUrl !== healthData.url) {
      finalUrlText = ` (重定向至 -> ${healthData.finalUrl})`;
    }
    const textLengthInfo = `文本长度: ${healthData.pageText ? healthData.pageText.length : 0}`;
    const logInfo = `日志: ${healthData.logStats.uniqueLogs}唯一/${healthData.logStats.totalLogs}总计`;
    logProgress('info', `[${status}] 页面: ${healthData.url}${finalUrlText} | ${textLengthInfo} | ${logInfo} | 问题: ${issuesText}`);
  }
}

// Timestamped console logger.
function logProgress(level, message) {
  const timestamp = new Date().toISOString();
  console.log(`[${timestamp}] [${level.toUpperCase()}] ${message}`);
}

// Analyze one URL: navigate, collect health data, and (in crawl mode) return
// the filtered same-host links discovered on the page.
// Returns null when already visited / beyond depth, [] when skipped/failed,
// else an array of { url } objects.
async function performHealthCheck(pageUrl, currentDepth = 1, browserManager = null, isDirectAnalysis = false) {
  if (visitedUrls.has(pageUrl) || (!isDirectAnalysis && currentDepth > maxDepth)) {
    return null;
  }
  if (!isDirectAnalysis && skipUrlKeywords.some(keyword => pageUrl.includes(keyword))) {
    logProgress('debug', `URL包含跳过关键词,跳过: ${pageUrl}`);
    return [];
  }

  logProgress('info', `分析中... ${pageUrl} (深度: ${currentDepth})`);
  visitedUrls.add(pageUrl);

  let page = null;
  try {
    await browserManager.waitForSlot();
    page = await browserManager.createPage();

    const healthChecker = new HealthChecker();
    await healthChecker.setupPageListeners(page);

    // Keep the navigation response: it carries the real HTTP status code.
    const navResponse = await page.goto(pageUrl, {
      waitUntil: 'domcontentloaded',
      timeout: CONFIG.pageTimeout
    });

    await page.waitForLoadState('networkidle', { timeout: CONFIG.loadTimeout }).catch(() => {
      logProgress('warn', `页面 ${pageUrl} 网络空闲等待超时,继续分析...`);
    });

    // Give late client-side rendering time to settle (configurable).
    await new Promise(resolve => setTimeout(resolve, CONFIG.postLoadDelay));

    let links = [];
    let filteredLinks = [];
    if (!isDirectAnalysis) {
      links = await page.evaluate((baseUrl) => {
        return Array.from(document.querySelectorAll('a'), anchor => {
          try {
            return new URL(anchor.href, baseUrl).href;
          } catch (e) {
            return null;
          }
        }).filter(Boolean);
      }, pageUrl);
      filteredLinks = filterLinks(links, pageUrl);
      allLinks.push(...filteredLinks);
    }

    const healthData = await healthChecker.collectHealthCheckData(
      page, pageUrl, currentDepth, links.length, filteredLinks.length,
      navResponse ? navResponse.status() : null
    );
    healthChecks.push(healthData);
    healthChecker.outputHealthSummary(healthData);

    await browserManager.closePage(page);
    return filteredLinks.map(url => ({ url }));
  } catch (error) {
    logProgress('error', `分析页面时出错 ${pageUrl}: ${error.message}`);
    // Failure record mirrors the success record's shape (incl. logStats) so
    // downstream aggregation needs no special-casing.
    const failureReport = {
      url: pageUrl,
      title: 'Analysis Failed',
      depth: currentDepth,
      statusCode: null,
      linksFound: 0,
      linksFiltered: 0,
      pageText: null,
      consoleMessages: [`Error during analysis setup: ${error.message}`],
      networkErrors: [],
      finalUrl: pageUrl,
      timestamp: new Date().toISOString(),
      health: { status: 'ERROR', issues: [`分析失败: ${error.message}`] },
      logStats: { uniqueLogs: 0, totalLogs: 0, uniqueErrors: 0 }
    };
    healthChecks.push(failureReport);
    if (page) {
      await browserManager.closePage(page);
    }
    return [];
  }
}

// Analyze a batch of URLs concurrently; returns the union of links discovered.
async function performConcurrentHealthCheck(urls, currentDepth, browserManager, isDirectAnalysis = false) {
  logProgress('info', `并发分析 ${urls.length} 个URL (深度: ${currentDepth})`);
  const results = await Promise.allSettled(
    urls.map(url => performHealthCheck(url, currentDepth, browserManager, isDirectAnalysis))
  );
  const allDiscoveredLinks = [];
  results.forEach(result => {
    if (result.status === 'fulfilled' && result.value) {
      allDiscoveredLinks.push(...result.value);
    }
  });
  return allDiscoveredLinks;
}

// Keep only unvisited, same-host, non-asset links not matching any skip
// keyword; fragments are stripped.
function filterLinks(links, sourceUrl) {
  const sourceHostname = new URL(sourceUrl).hostname;
  return links.filter(linkUrl => {
    try {
      const parsedUrl = new URL(linkUrl);
      const urlWithoutHash = linkUrl.split('#')[0];
      if (visitedUrls.has(urlWithoutHash)) return false;
      if (parsedUrl.hostname !== sourceHostname) return false;
      const ext = path.extname(parsedUrl.pathname).toLowerCase();
      if (['.jpg', '.png', '.gif', '.pdf', '.zip'].includes(ext)) return false;
      if (skipUrlKeywords.some(keyword => urlWithoutHash.includes(keyword))) return false;
      return true;
    } catch (e) {
      return false;
    }
  }).map(linkUrl => linkUrl.split('#')[0]);
}

// Print the run summary, ensure the per-domain output directory exists, and
// return it for report writing.
function saveResults() {
  const domain = new URL(targetUrl).hostname;
  const domainDir = path.join(outputDir, domain);
  if (!fs.existsSync(domainDir)) {
    fs.mkdirSync(domainDir, { recursive: true });
  }

  const healthyCount = healthChecks.filter(h => h.health.status === 'HEALTHY').length;
  const warningCount = healthChecks.filter(h => h.health.status === 'WARNING').length;
  const errorCount = healthChecks.filter(h => h.health.status === 'ERROR').length;
  const totalLogs = healthChecks.reduce((sum, check) => sum + (check.logStats?.totalLogs || 0), 0);
  const uniqueLogs = healthChecks.reduce((sum, check) => sum + (check.logStats?.uniqueLogs || 0), 0);
  const uniqueErrors = healthChecks.reduce((sum, check) => sum + (check.logStats?.uniqueErrors || 0), 0);

  console.log(`\n--- 分析总结 ---`);
  console.log(`总共分析页面: ${visitedUrls.size}`);
  console.log(` - 健康 [OK]: ${healthyCount}`);
  console.log(` - 警告 [WARN]: ${warningCount}`);
  console.log(` - 失败/错误 [ERROR]: ${errorCount}`);
  console.log(`日志统计:`);
  console.log(` - 唯一日志: ${uniqueLogs}`);
  console.log(` - 总日志数: ${totalLogs}`);
  console.log(` - 唯一错误: ${uniqueErrors}`);
  console.log(`报告输出目录: ${domainDir}`);

  return { domainDir };
}

// Shared teardown for both run modes: close the browser, write reports, log
// the total duration and echo the JSON report path.
async function finalizeRun(browserManager, startTime) {
  await browserManager.shutdown();
  logProgress('info', `浏览器已关闭`);
  const { domainDir } = saveResults();
  const jsonPath = outputJsonResults(domainDir);
  const duration = (Date.now() - startTime) / 1000;
  logProgress('info', `分析完成. 总耗时: ${duration.toFixed(2)}s`);
  console.log(jsonPath);
}

// Crawl mode: breadth-first from targetUrl up to maxDepth.
async function crawlAndAnalyze() {
  logProgress('info', `启动爬取分析模式...`);
  const startTime = Date.now();
  const browserManager = new BrowserManager(maxConcurrency, config);

  try {
    let urlsToProcess = [targetUrl];
    for (let depth = 1; depth <= maxDepth; depth++) {
      if (urlsToProcess.length === 0) {
        logProgress('info', `深度 ${depth}: 无新链接可分析,结束爬取。`);
        break;
      }
      const discoveredLinks = await performConcurrentHealthCheck(urlsToProcess, depth, browserManager);
      const nextUrls = [...new Set(discoveredLinks.map(link => link.url))];
      urlsToProcess = nextUrls.filter(url => !visitedUrls.has(url));
    }
  } catch (error) {
    logProgress('error', `爬取分析过程中发生严重错误: ${error.message}`);
  } finally {
    await finalizeRun(browserManager, startTime);
  }
}

// Direct-analysis mode: analyze targetUrl plus each resolved --main_page_url.
async function analyzeDirectPages() {
  logProgress('info', `启动直接页面分析模式...`);
  const startTime = Date.now();
  const browserManager = new BrowserManager(maxConcurrency, config);

  try {
    const urlSet = new Set();
    urlSet.add(targetUrl);
    mainPageUrls.forEach(pagePath => {
      try {
        const fullUrl = new URL(pagePath, targetUrl).href;
        urlSet.add(fullUrl);
      } catch (e) {
        logProgress('warn', `无法将 "${pagePath}" 和基础URL "${targetUrl}" 组合,跳过此项。`);
      }
    });
    const fullUrlsToAnalyze = Array.from(urlSet);
    logProgress('info', `将要分析的完整URL列表 (${fullUrlsToAnalyze.length}个): ${fullUrlsToAnalyze.join(', ')}`);
    await performConcurrentHealthCheck(fullUrlsToAnalyze, 1, browserManager, true);
  } catch (error) {
    logProgress('error', `直接页面分析过程中发生严重错误: ${error.message}`);
  } finally {
    await finalizeRun(browserManager, startTime);
  }
}

// Write the machine-readable JSON report; returns its absolute path.
function outputJsonResults(domainDir) {
  const jsonOutput = {
    summary: {
      total_pages_analyzed: visitedUrls.size,
      healthy_pages: healthChecks.filter(h => h.health.status === 'HEALTHY').length,
      warning_pages: healthChecks.filter(h => h.health.status === 'WARNING').length,
      error_pages: healthChecks.filter(h => h.health.status === 'ERROR').length,
      mode: mainPageUrls.length > 0 ? 'direct_analysis' : 'crawl',
      base_target: targetUrl,
      total_logs: healthChecks.reduce((sum, check) => sum + (check.logStats?.totalLogs || 0), 0),
      unique_logs: healthChecks.reduce((sum, check) => sum + (check.logStats?.uniqueLogs || 0), 0),
      unique_errors: healthChecks.reduce((sum, check) => sum + (check.logStats?.uniqueErrors || 0), 0),
    },
    results: healthChecks.map(check => ({
      url: check.url,
      final_url: check.finalUrl,
      title: check.title,
      status: check.health.status,
      status_code: check.statusCode,
      issues: check.health.issues,
      page_text: check.pageText,
      console_logs: check.consoleMessages,
      network_errors: check.networkErrors,
      log_stats: check.logStats || { uniqueLogs: 0, totalLogs: 0, uniqueErrors: 0 }
    })),
    timestamp: new Date().toISOString()
  };

  const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
  const jsonFileName = `analysis_report_${timestamp}.json`;
  const jsonFilePath = path.join(domainDir, jsonFileName);
  fs.writeFileSync(jsonFilePath, JSON.stringify(jsonOutput, null, 2));
  const absoluteJsonPath = path.resolve(jsonFilePath);
  console.log(`JSON报告已保存至: ${absoluteJsonPath}`);
  console.log(absoluteJsonPath);
  return absoluteJsonPath;
}

// Entry point.
(async () => {
  try {
    initializeConfig();

    console.log('\n=== Web Page Analyzer (精简版) ===');
    const mode = mainPageUrls.length > 0 ? '直接页面分析' : '爬取分析';
    console.log(`运行模式: ${mode}`);
    console.log(`基础URL: ${targetUrl}`);
    if (mode === '直接页面分析') {
      console.log(`额外指定路径数量: ${mainPageUrls.length}`);
    } else {
      console.log(`分析深度: ${maxDepth}`);
    }
    console.log(`页面内容关键字: ${pageKeywords.join(', ')}`);
    console.log(`日志内容关键字: ${logKeywords.join(', ')}`);
    console.log('===================================\n');

    if (mainPageUrls.length > 0) {
      await analyzeDirectPages();
    } else {
      await crawlAndAnalyze();
    }
    // Exit explicitly on success so lingering handles cannot keep the
    // process alive; failures above already exited with code 1.
    process.exit(0);
  } catch (error) {
    console.error('脚本执行出错:', error);
    process.exit(1);
  }
})();