UNPKG

web-page-analyzer-cli

Version:

一个强大的网站链接抓取工具,支持深度抓取、认证和页面分析

339 lines (297 loc) 13 kB
#!/usr/bin/env node
const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');
const { URL } = require('url');

/** Horizontal rules used to frame the sections of the generated report. */
const SECTION_RULE = '#############################################################';
const DASH_RULE = '-------------------------------------------------------------';

/**
 * Print the usage text and terminate the process.
 *
 * @param {number} [exitCode=0] - Process exit status. Callers that show the
 *   help because of a usage error should pass a non-zero value.
 */
function showHelp(exitCode = 0) {
  console.log(`
web-snapshot - 网页快照和控制台日志捕获工具

功能:
  访问指定的URL,捕获其最终的DOM结构快照和浏览器控制台日志,
  并将这些信息合并写入到一个文本文件中。

使用方法:
  web-snapshot <URL> [选项]

参数:
  URL                       必需,要进行快照的目标网页URL。

认证选项:
  --cookie="name=value"     为页面请求设置Cookie。可多次使用。
  --header="name:value"     为页面请求设置自定义HTTP头。可多次使用。
  --localstorage="key=value"在页面加载前设置LocalStorage。可多次使用。
  --token=<your-token>      设置Bearer Token认证的快捷方式。
  --auth=<user:pass>        设置Basic Auth认证的快捷方式。

控制选项:
  --output=<file_path>      指定输出文件的路径。
                            (默认: ./output/snapshots/[hostname]/snapshot-[timestamp].txt)
  --output-dir=<dir>        指定输出目录 (默认: ./output/snapshots)
  --wait-for=<selector>     在捕获快照前等待指定的CSS选择器元素出现。
  --wait-time=<ms>          页面加载后额外等待的毫秒数 (默认: 15000)
  -h, --help                显示此帮助信息。
`);
  process.exit(exitCode);
}

/**
 * Split `text` at the FIRST occurrence of `separator`.
 *
 * Everything after the first separator (including further separators) is kept
 * in the second element, so values such as `a=b=c` or passwords containing
 * `:` survive intact.
 *
 * @param {string} text
 * @param {string} separator
 * @returns {[string, string]} `[head, tail]`; `tail` is `''` when the
 *   separator does not occur.
 */
function splitAtFirst(text, separator) {
  const index = text.indexOf(separator);
  if (index === -1) {
    return [text, ''];
  }
  return [text.slice(0, index), text.slice(index + separator.length)];
}

/**
 * Parse `process.argv` into the configuration object used by
 * {@link takeSnapshotAndLogs}.
 *
 * Side effects: may print help / errors and exit the process, and creates the
 * directory that will contain the output file.
 *
 * @returns {{
 *   targetUrl: string,
 *   authOptions: {
 *     headers: Object<string, string>,
 *     cookies: Array<{name: string, value: string}>,
 *     localstorage: Array<{name: string, value: string}>
 *   },
 *   output: string,
 *   outputDir: string,
 *   waitForSelector: (string|null),
 *   waitTime: number
 * }} The parsed options.
 */
function parseArguments() {
  const args = process.argv.slice(2);

  if (args.includes('-h') || args.includes('--help') || args.length === 0) {
    showHelp();
  }

  const options = {
    targetUrl: null,
    authOptions: {
      headers: {},
      cookies: [],
      localstorage: [],
    },
    output: null,
    outputDir: 'output/snapshots',
    waitForSelector: null,
    waitTime: 15000,
  };

  for (const arg of args) {
    if (arg.startsWith('--cookie=')) {
      const [name, value] = splitAtFirst(arg.substring(9), '=');
      if (name && value) {
        options.authOptions.cookies.push({ name: name.trim(), value: value.trim() });
      }
    } else if (arg.startsWith('--header=')) {
      const [name, value] = splitAtFirst(arg.substring(9), ':');
      if (name && value) {
        options.authOptions.headers[name.trim()] = value.trim();
      }
    } else if (arg.startsWith('--localstorage=')) {
      const [key, value] = splitAtFirst(arg.substring(15), '=');
      if (key && value) {
        // Playwright's storageState expects { name, value } entries.
        options.authOptions.localstorage.push({ name: key.trim(), value: value.trim() });
      }
    } else if (arg.startsWith('--token=')) {
      options.authOptions.headers['Authorization'] = `Bearer ${arg.substring(8)}`;
    } else if (arg.startsWith('--auth=')) {
      // Split at the first ':' only: the user name cannot contain a colon,
      // but the password legitimately can.
      const [username, password] = splitAtFirst(arg.substring(7), ':');
      if (username && password) {
        const encoded = Buffer.from(`${username}:${password}`).toString('base64');
        options.authOptions.headers['Authorization'] = `Basic ${encoded}`;
      }
    } else if (arg.startsWith('--output=')) {
      options.output = arg.substring(9);
    } else if (arg.startsWith('--output-dir=')) {
      options.outputDir = arg.substring(13);
    } else if (arg.startsWith('--wait-for=')) {
      options.waitForSelector = arg.substring(11);
    } else if (arg.startsWith('--wait-time=')) {
      const time = Number.parseInt(arg.substring(12), 10);
      if (!Number.isNaN(time)) {
        options.waitTime = time;
      }
    } else if (!arg.startsWith('-') && options.targetUrl === null) {
      // The first positional argument is the target URL; later ones are ignored.
      options.targetUrl = arg;
    }
  }

  if (!options.targetUrl) {
    console.error('错误: 必须提供目标URL。');
    // A missing URL is a usage error, so exit with a failing status.
    showHelp(1);
  }

  let parsedUrl;
  try {
    parsedUrl = new URL(options.targetUrl);
  } catch {
    console.error(`错误: 无效的URL格式 -> ${options.targetUrl}`);
    process.exit(1);
  }

  if (!options.output) {
    // Default location: <outputDir>/<sanitized hostname>/snapshot-<timestamp>.txt
    const domain = parsedUrl.hostname.replace(/[^a-z0-9-]/gi, '_');
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    options.output = path.join(options.outputDir, domain, `snapshot-${timestamp}.txt`);
  }

  // Create the directory of the file that will actually be written, so that a
  // custom --output path in a not-yet-existing directory works too.
  fs.mkdirSync(path.dirname(path.resolve(options.output)), { recursive: true });

  return options;
}

/**
 * Build an indented, text-only outline of the visible DOM under `<body>`.
 *
 * This function is passed to `page.evaluate`, so it runs inside the browser
 * page. It must stay self-contained: it may only use browser globals and the
 * helpers declared inside its own body.
 *
 * @returns {string} One line per visible element (`- <tag attrs> "text"`),
 *   indented by depth, or a fixed message when the document has no body.
 */
function generateDomSnapshot() {
  const ATTRIBUTE_WHITELIST = [
    'href', 'src', 'alt', 'title', 'for', 'name', 'type', 'value',
    'placeholder', 'role', 'aria-label', 'aria-labelledby', 'data-testid',
  ];
  const BOOLEAN_ATTRIBUTES = ['disabled', 'required', 'checked', 'selected', 'readonly'];
  const SKIPPED_TAGS = ['script', 'style', 'meta', 'link', 'head'];

  // Collects only the element's own (direct child) text nodes, capped at 500 chars.
  function getElementText(element) {
    let text = '';
    for (const node of element.childNodes) {
      if (node.nodeType === Node.TEXT_NODE && node.textContent.trim()) {
        text += node.textContent.trim() + ' ';
      }
    }
    text = text.trim();
    return text.length > 500 ? text.substring(0, 497) + '...' : text;
  }

  // Recursively renders `element` and its element children; stops below depth 20.
  function buildSnapshot(element, depth) {
    if (!element || !element.tagName || depth > 20) {
      return '';
    }

    const style = window.getComputedStyle(element);
    if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') {
      return '';
    }

    const tag = element.tagName.toLowerCase();
    if (SKIPPED_TAGS.includes(tag)) {
      return '';
    }

    const indent = ' '.repeat(depth);
    if (tag === 'svg') {
      // SVG internals are noise for a text outline; collapse them to a marker.
      return indent + `- <svg> [icon]\n`;
    }

    let attributes = '';
    if (element.id) {
      attributes += ` id="${element.id}"`;
    }
    // className is not a string on SVG-namespace elements, hence the typeof check.
    if (element.className && typeof element.className === 'string') {
      attributes += ` class="${element.className}"`;
    }
    for (const attr of ATTRIBUTE_WHITELIST) {
      if (element.hasAttribute(attr)) {
        attributes += ` ${attr}="${element.getAttribute(attr)}"`;
      }
    }
    for (const attr of BOOLEAN_ATTRIBUTES) {
      if (element.hasAttribute(attr)) {
        attributes += ` [${attr}]`;
      }
    }

    let line = `${indent}- <${tag}${attributes}>`;
    const text = getElementText(element);
    if (text) {
      line += ` "${text}"`;
    }

    let result = line + '\n';
    for (const child of element.children) {
      result += buildSnapshot(child, depth + 1);
    }
    return result;
  }

  return document.body
    ? buildSnapshot(document.body, 0)
    : '页面没有 <body> 标签,无法生成快照。';
}

/**
 * Translate the CLI auth options into Playwright browser-context options.
 *
 * @param {string} targetUrl - Target page URL; cookies without an explicit
 *   domain and all localStorage entries are bound to this URL's host/origin.
 * @param {{headers: Object, cookies: Array, localstorage: Array}} authOptions
 * @returns {object} Options object for `browser.newContext`.
 */
function buildContextOptions(targetUrl, authOptions) {
  const { headers, cookies, localstorage } = authOptions;
  const { origin, hostname } = new URL(targetUrl);
  const contextOptions = {};

  if (Object.keys(headers).length > 0) {
    contextOptions.extraHTTPHeaders = headers;
    console.log('[INFO] 准备设置自定义 Headers。');
  }

  const storageState = { cookies: [], origins: [] };
  if (cookies.length > 0) {
    storageState.cookies = cookies.map((cookie) => ({
      ...cookie,
      domain: cookie.domain || hostname,
      path: cookie.path || '/',
    }));
    console.log('[INFO] 准备设置自定义 Cookies。');
  }
  if (localstorage.length > 0) {
    storageState.origins.push({ origin, localStorage: localstorage });
    console.log('[INFO] 准备设置 LocalStorage。');
  }
  if (storageState.cookies.length > 0 || storageState.origins.length > 0) {
    contextOptions.storageState = storageState;
  }

  return contextOptions;
}

/**
 * Render the de-duplicated log list, annotating entries seen more than once.
 *
 * @param {string[]} uniqueLogs - Log lines in first-seen order.
 * @param {Map<string, number>} logCounts - Occurrence count per log line.
 * @returns {string[]} Display lines.
 */
function formatLogs(uniqueLogs, logCounts) {
  return uniqueLogs.map((log) => {
    const count = logCounts.get(log);
    return count > 1 ? `${log} (重复 ${count} 次)` : log;
  });
}

/**
 * Main task: load the page, capture the DOM outline and console output, and
 * write a text report to `options.output`.
 *
 * On failure a reduced report (error message plus the logs captured so far)
 * is written to the same path and `process.exitCode` is set to 1. The browser
 * is always closed before the function returns.
 *
 * @param {object} options - Configuration produced by {@link parseArguments}.
 * @returns {Promise<void>}
 */
async function takeSnapshotAndLogs(options) {
  const { targetUrl, authOptions, output, waitForSelector, waitTime } = options;
  console.log(`[INFO] 正在启动浏览器...`);

  const contextOptions = buildContextOptions(targetUrl, authOptions);
  const browser = await chromium.launch({ headless: true });

  const consoleLogs = []; // unique log lines, first-seen order
  const errorLogs = new Set(); // unique error-level lines
  const logCounts = new Map(); // log line -> number of occurrences

  const recordLog = (entry, isError) => {
    const seen = logCounts.get(entry) ?? 0;
    logCounts.set(entry, seen + 1);
    if (seen === 0) {
      consoleLogs.push(entry);
    }
    if (isError) {
      errorLogs.add(entry);
    }
  };

  try {
    // Context/page creation is inside the try so the browser is closed even
    // if it fails (e.g. invalid storage state).
    const context = await browser.newContext(contextOptions);
    const page = await context.newPage();

    page.on('console', (msg) => {
      const logText = msg.text();
      if (logText.includes('Download the React DevTools')) {
        return;
      }
      recordLog(`[CONSOLE.${msg.type().toUpperCase()}] ${logText}`, msg.type() === 'error');
    });
    page.on('pageerror', (error) => {
      recordLog(`[PAGE_ERROR] ${error.message}`, true);
    });
    page.on('requestfailed', (request) => {
      const reason = request.failure()?.errorText || 'Unknown error';
      recordLog(`[REQUEST_FAILED] ${request.method()} ${request.url()} - ${reason}`, true);
    });

    console.log(`[INFO] 正在导航到: ${targetUrl}`);
    const response = await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });

    console.log(`[INFO] 页面加载完成,等待网络活动停止...`);
    // Best effort: pages with polling/websockets never become idle.
    await page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => {
      console.warn('[WARN] 等待网络空闲超时,可能仍有后台活动。');
    });

    if (waitForSelector) {
      console.log(`[INFO] 正在等待选择器: ${waitForSelector}`);
      await page.waitForSelector(waitForSelector, { timeout: 15000 });
    }

    if (waitTime > 0) {
      console.log(`[INFO] 额外等待 ${waitTime}ms 以确保动态内容加载...`);
      await page.waitForTimeout(waitTime);
    }

    console.log('[INFO] 正在捕获DOM快照和控制台日志...');
    const domSnapshot = await page.evaluate(generateDomSnapshot);
    const pageTitle = await page.title();
    const finalUrl = page.url();
    const statusCode = response ? response.status() : 'N/A';

    const processedConsoleLogs = formatLogs(consoleLogs, logCounts);
    const totalLogs = Array.from(logCounts.values()).reduce((sum, count) => sum + count, 0);

    const outputContent = `
${SECTION_RULE}
# Web Snapshot Report
${SECTION_RULE}
- Original URL: ${targetUrl}
- Final URL:    ${finalUrl}
- Page Title:   ${pageTitle}
- HTTP Status:  ${statusCode}
- Timestamp:    ${new Date().toISOString()}
- Unique Logs:  ${consoleLogs.length}
- Total Logs:   ${totalLogs}
- Unique Errors:${errorLogs.size}

${SECTION_RULE}
# DOM Snapshot
${SECTION_RULE}
${domSnapshot.trim() || 'DOM Snapshot is empty or cannot be generated.'}

${SECTION_RULE}
# Console Logs
${SECTION_RULE}
${processedConsoleLogs.length > 0 ? processedConsoleLogs.join('\n') : 'No console logs captured.'}
`;

    fs.writeFileSync(output, outputContent.trim());
    const absolutePath = path.resolve(output);
    console.log(`\n[SUCCESS] 快照成功保存到: ${absolutePath}`);
    console.log(absolutePath); // bare absolute path, convenient for calling scripts
  } catch (error) {
    console.error(`\n[ERROR] 操作失败: ${error.message}`);

    // Still try to persist whatever was captured before the failure.
    const processedConsoleLogs = formatLogs(consoleLogs, logCounts);
    const errorContent = `
${SECTION_RULE}
# Web Snapshot - FAILED
${SECTION_RULE}
- URL: ${targetUrl}
- Timestamp: ${new Date().toISOString()}
- Error: ${error.message}

${DASH_RULE}
# Captured Console Logs Before Failure
${DASH_RULE}
${processedConsoleLogs.length > 0 ? processedConsoleLogs.join('\n') : '没有捕获到控制台日志。'}
`;

    try {
      fs.writeFileSync(output, errorContent.trim());
      const absolutePath = path.resolve(output);
      console.log(`[INFO] 包含错误信息的日志已尝试保存到: ${absolutePath}`);
      console.log(absolutePath);
    } catch (writeError) {
      console.error(`[ERROR] 无法写入输出文件: ${writeError.message}`);
    }

    // Use exitCode rather than process.exit(): exit() would terminate
    // immediately and skip the `finally` block that closes the browser.
    process.exitCode = 1;
  } finally {
    console.log('[INFO] 正在关闭浏览器...');
    await browser.close();
  }
}

// Script entry point.
(async () => {
  const options = parseArguments();
  await takeSnapshotAndLogs(options);
})().catch((error) => {
  // Reached for failures outside the main try block, e.g. the browser failing to launch.
  console.error(`\n[ERROR] 操作失败: ${error.message}`);
  process.exit(1);
});