web-page-analyzer-cli
Version:
一个强大的网站链接抓取工具,支持深度抓取、认证和页面分析
339 lines (297 loc) • 13 kB
JavaScript
const { chromium } = require('playwright');
const fs = require('fs');
const path = require('path');
const { URL } = require('url');
/**
 * Print the CLI usage text to stdout, then terminate the process with
 * exit status 0. This function never returns to its caller.
 */
function showHelp() {
  // The usage text is printed verbatim, including its leading and trailing newline.
  const usageText = `
web-snapshot - 网页快照和控制台日志捕获工具
功能:
访问指定的URL,捕获其最终的DOM结构快照和浏览器控制台日志,
并将这些信息合并写入到一个文本文件中。
使用方法:
web-snapshot <URL> [选项]
参数:
URL 必需,要进行快照的目标网页URL。
认证选项:
--cookie="name=value" 为页面请求设置Cookie。可多次使用。
--header="name:value" 为页面请求设置自定义HTTP头。可多次使用。
--localstorage="key=value"在页面加载前设置LocalStorage。可多次使用。
--token=<your-token> 设置Bearer Token认证的快捷方式。
--auth=<user:pass> 设置Basic Auth认证的快捷方式。
控制选项:
--output=<file_path> 指定输出文件的路径。
(默认: ./output/snapshots/[hostname]/snapshot-[timestamp].txt)
--output-dir=<dir> 指定输出目录 (默认: ./output/snapshots)
--wait-for=<selector> 在捕获快照前等待指定的CSS选择器元素出现。
--wait-time=<ms> 页面加载后额外等待的毫秒数 (默认: 15000)
-h, --help 显示此帮助信息。
`;
  console.log(usageText);
  process.exit(0);
}
/**
 * Parse the command-line arguments into the snapshot configuration.
 *
 * Behavior:
 * - With no arguments, or with `-h` / `--help`, prints the help text via
 *   `showHelp()` (which terminates the process).
 * - When the target URL is missing or is not a parseable URL, prints an error
 *   to stderr and exits with status 1.
 * - Malformed `--cookie`, `--header`, `--localstorage`, `--auth` and
 *   `--wait-time` options are skipped with a warning on stderr.
 * - Side effect: creates the directory that will contain the output file
 *   (the parent of `--output` when given, otherwise
 *   `<outputDir>/<sanitized hostname>`).
 *
 * @returns {object} Options object with `targetUrl`, `authOptions`
 *   (`headers`, `cookies`, `localstorage`), `output`, `outputDir`,
 *   `waitForSelector` and `waitTime`.
 */
function parseArguments() {
  const args = process.argv.slice(2);
  if (args.length === 0 || args.includes('-h') || args.includes('--help')) {
    showHelp();
  }

  const options = {
    targetUrl: null,
    authOptions: {
      headers: {},
      cookies: [],
      localstorage: [],
    },
    output: null,
    outputDir: 'output/snapshots',
    waitForSelector: null,
    waitTime: 15000,
  };

  // Split at the FIRST separator only, so the value part may itself contain
  // the separator (e.g. a Basic Auth password with ':' or a cookie with '=').
  const splitAtFirst = (text, separator) => {
    const index = text.indexOf(separator);
    return index === -1
      ? [text, '']
      : [text.slice(0, index), text.slice(index + separator.length)];
  };

  // Only the option name is echoed: the value may be a credential.
  const warnIgnored = (arg) => {
    console.warn(`[WARN] 已忽略格式无效的选项: ${arg.split('=')[0]}`);
  };

  // Read a trimmed `name<separator>value` pair; returns null (after warning)
  // when either side is empty.
  const readPair = (arg, prefix, separator) => {
    const [rawName, rawValue] = splitAtFirst(arg.slice(prefix.length), separator);
    const name = rawName.trim();
    const value = rawValue.trim();
    if (!name || !value) {
      warnIgnored(arg);
      return null;
    }
    return { name, value };
  };

  for (const arg of args) {
    if (arg.startsWith('--cookie=')) {
      const pair = readPair(arg, '--cookie=', '=');
      if (pair) options.authOptions.cookies.push(pair);
    } else if (arg.startsWith('--header=')) {
      const pair = readPair(arg, '--header=', ':');
      if (pair) options.authOptions.headers[pair.name] = pair.value;
    } else if (arg.startsWith('--localstorage=')) {
      const pair = readPair(arg, '--localstorage=', '=');
      if (pair) options.authOptions.localstorage.push(pair);
    } else if (arg.startsWith('--token=')) {
      options.authOptions.headers['Authorization'] = `Bearer ${arg.slice('--token='.length)}`;
    } else if (arg.startsWith('--auth=')) {
      // Credentials are deliberately not trimmed: whitespace can be significant.
      const [username, password] = splitAtFirst(arg.slice('--auth='.length), ':');
      if (username && password) {
        const encoded = Buffer.from(`${username}:${password}`).toString('base64');
        options.authOptions.headers['Authorization'] = `Basic ${encoded}`;
      } else {
        warnIgnored(arg);
      }
    } else if (arg.startsWith('--output=')) {
      options.output = arg.slice('--output='.length);
    } else if (arg.startsWith('--output-dir=')) {
      options.outputDir = arg.slice('--output-dir='.length);
    } else if (arg.startsWith('--wait-for=')) {
      options.waitForSelector = arg.slice('--wait-for='.length);
    } else if (arg.startsWith('--wait-time=')) {
      const time = Number.parseInt(arg.slice('--wait-time='.length), 10);
      if (Number.isNaN(time)) {
        warnIgnored(arg);
      } else {
        options.waitTime = time;
      }
    } else if (!arg.startsWith('-') && options.targetUrl === null) {
      // The first positional argument is the target URL; later ones are ignored.
      options.targetUrl = arg;
    }
  }

  if (!options.targetUrl) {
    // Exit directly with a failure status: showHelp() always exits with 0,
    // which would report this error as success to calling scripts.
    console.error('错误: 必须提供目标URL。');
    console.error('使用 --help 查看完整的使用方法。');
    process.exit(1);
  }

  let parsedUrl;
  try {
    parsedUrl = new URL(options.targetUrl);
  } catch {
    console.error(`错误: 无效的URL格式 -> ${options.targetUrl}`);
    process.exit(1);
  }

  if (!options.output) {
    // Default location: <outputDir>/<hostname with unsafe chars replaced>/snapshot-<ISO time>.txt
    const domain = parsedUrl.hostname.replace(/[^a-z0-9-]/gi, '_');
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    options.output = path.join(options.outputDir, domain, `snapshot-${timestamp}.txt`);
  }

  // Create the directory the report will actually be written into, so the
  // later file write cannot fail on a missing parent directory.
  fs.mkdirSync(path.dirname(options.output), { recursive: true });

  return options;
}
/**
 * Produce an indented, text-only outline of the DOM below <body>.
 *
 * This function is handed to `page.evaluate`, so it executes inside the
 * browser page and must stay self-contained: it may only use browser globals
 * (`window`, `document`, `Node`) and names declared in its own body.
 *
 * Each emitted line has the form `- <tag attrs> "own text"`, indented by one
 * space per nesting level. Elements hidden through `display`, `visibility` or
 * `opacity`, non-content tags, and anything nested deeper than 20 levels are
 * omitted; an <svg> is collapsed into a single `[icon]` placeholder line.
 *
 * @returns {string} The outline, or a fixed message when the document has no body.
 */
function generateDomSnapshot() {
  const MAX_DEPTH = 20;
  const MAX_TEXT_LENGTH = 500;
  const SKIPPED_TAGS = ['script', 'style', 'meta', 'link', 'head'];
  const VALUE_ATTRIBUTES = ['href', 'src', 'alt', 'title', 'for', 'name', 'type', 'value', 'placeholder', 'role', 'aria-label', 'aria-labelledby', 'data-testid'];
  const FLAG_ATTRIBUTES = ['disabled', 'required', 'checked', 'selected', 'readonly'];

  // True when computed style marks the element as not rendered or not visible.
  const isHidden = (node) => {
    const computed = window.getComputedStyle(node);
    return computed.display === 'none' || computed.visibility === 'hidden' || computed.opacity === '0';
  };

  // Text from the element's direct text nodes only (descendant text is
  // reported on the descendants' own lines), capped at MAX_TEXT_LENGTH.
  const ownText = (node) => {
    const pieces = [];
    for (const child of node.childNodes) {
      if (child.nodeType !== Node.TEXT_NODE) continue;
      const trimmed = child.textContent.trim();
      if (trimmed) pieces.push(trimmed);
    }
    const joined = pieces.join(' ');
    if (joined.length > MAX_TEXT_LENGTH) {
      return joined.substring(0, MAX_TEXT_LENGTH - 3) + '...';
    }
    return joined;
  };

  // id and class first, then whitelisted valued attributes, then boolean flags.
  const describeAttributes = (node) => {
    const parts = [];
    if (node.id) parts.push(` id="${node.id}"`);
    // className may be a non-string object, which is skipped here.
    if (node.className && typeof node.className === 'string') {
      parts.push(` class="${node.className}"`);
    }
    for (const attrName of VALUE_ATTRIBUTES) {
      if (node.hasAttribute(attrName)) parts.push(` ${attrName}="${node.getAttribute(attrName)}"`);
    }
    for (const flagName of FLAG_ATTRIBUTES) {
      if (node.hasAttribute(flagName)) parts.push(` [${flagName}]`);
    }
    return parts.join('');
  };

  // Render one element line followed by the lines of its visible children.
  const render = (node, depth) => {
    if (!node || !node.tagName || depth > MAX_DEPTH) return '';
    if (isHidden(node)) return '';

    const tagName = node.tagName.toLowerCase();
    if (SKIPPED_TAGS.includes(tagName)) return '';

    const pad = ' '.repeat(depth);
    if (tagName === 'svg') return `${pad}- <svg> [icon]\n`;

    const header = `${pad}- <${tagName}${describeAttributes(node)}>`;
    const text = ownText(node);
    const firstLine = text ? `${header} "${text}"` : header;
    const nested = Array.from(node.children, (child) => render(child, depth + 1)).join('');
    return `${firstLine}\n${nested}`;
  };

  if (!document.body) {
    return '页面没有 <body> 标签,无法生成快照。';
  }
  return render(document.body, 0);
}
/**
 * Record one log line in the collector: every occurrence is counted, the
 * first occurrence is kept in arrival order, and error lines are remembered
 * in a separate de-duplicated set.
 */
function recordSnapshotLog(collector, line, isError) {
  const occurrences = (collector.counts.get(line) || 0) + 1;
  collector.counts.set(line, occurrences);
  if (occurrences === 1) collector.lines.push(line);
  if (isError) collector.errors.add(line);
}

/**
 * Render the unique log lines, appending a repeat marker only to lines that
 * occurred more than once.
 */
function formatSnapshotLogs(collector) {
  return collector.lines.map((line) => {
    const count = collector.counts.get(line);
    return count > 1 ? `${line} (重复 ${count} 次)` : line;
  });
}

/**
 * Main task: open the target URL in headless Chromium, capture the DOM
 * outline and the browser console/error/request-failure logs, and write a
 * combined text report to `options.output`.
 *
 * On failure after the browser has launched, a "FAILED" report containing the
 * logs captured so far is written to the same path and `process.exitCode` is
 * set to 1. The browser is always closed before this function returns, so the
 * process ends with a failure status once the caller has nothing left to do.
 *
 * @param {object} options - Parsed configuration; reads `targetUrl`,
 *   `authOptions` (`headers`, `cookies`, `localstorage`), `output`,
 *   `waitForSelector` and `waitTime`.
 * @returns {Promise<void>}
 */
async function takeSnapshotAndLogs(options) {
  const { targetUrl, authOptions, output, waitForSelector, waitTime } = options;
  console.log(`[INFO] 正在启动浏览器...`);

  // --- Assemble the browser-context options that carry the auth data ---
  const contextOptions = {};
  const { headers, cookies, localstorage } = authOptions;
  const targetParsedUrl = new URL(targetUrl);
  const targetOrigin = targetParsedUrl.origin;
  const targetDomain = targetParsedUrl.hostname;
  if (Object.keys(headers).length > 0) {
    contextOptions.extraHTTPHeaders = headers;
    console.log('[INFO] 准备设置自定义 Headers。');
  }
  const storageState = { cookies: [], origins: [] };
  if (cookies.length > 0) {
    // Cookies given without a domain/path are scoped to the target host and '/'.
    storageState.cookies = cookies.map((c) => ({ ...c, domain: c.domain || targetDomain, path: c.path || '/' }));
    console.log('[INFO] 准备设置自定义 Cookies。');
  }
  if (localstorage.length > 0) {
    storageState.origins.push({ origin: targetOrigin, localStorage: localstorage });
    console.log('[INFO] 准备设置 LocalStorage。');
  }
  if (storageState.cookies.length > 0 || storageState.origins.length > 0) {
    contextOptions.storageState = storageState;
  }

  const browser = await chromium.launch({ headless: true });
  const collector = { lines: [], counts: new Map(), errors: new Set() };

  // Everything after launch runs inside try/finally so that a failure while
  // creating the context or page cannot leave the browser running.
  try {
    const context = await browser.newContext(contextOptions);
    const page = await context.newPage();

    page.on('console', (msg) => {
      const logText = msg.text();
      // Skip the React DevTools advertisement; it is noise, not page output.
      if (logText.includes('Download the React DevTools')) return;
      recordSnapshotLog(collector, `[CONSOLE.${msg.type().toUpperCase()}] ${logText}`, msg.type() === 'error');
    });
    page.on('pageerror', (error) => {
      recordSnapshotLog(collector, `[PAGE_ERROR] ${error.message}`, true);
    });
    page.on('requestfailed', (request) => {
      const reason = request.failure()?.errorText || 'Unknown error';
      recordSnapshotLog(collector, `[REQUEST_FAILED] ${request.method()} ${request.url()} - ${reason}`, true);
    });

    console.log(`[INFO] 正在导航到: ${targetUrl}`);
    const response = await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
    console.log(`[INFO] 页面加载完成,等待网络活动停止...`);
    // Network idle is best-effort: on timeout, warn and continue with the snapshot.
    await page.waitForLoadState('networkidle', { timeout: 15000 }).catch(() => {
      console.warn('[WARN] 等待网络空闲超时,可能仍有后台活动。');
    });
    if (waitForSelector) {
      console.log(`[INFO] 正在等待选择器: ${waitForSelector}`);
      await page.waitForSelector(waitForSelector, { timeout: 15000 });
    }
    if (waitTime > 0) {
      console.log(`[INFO] 额外等待 ${waitTime}ms 以确保动态内容加载...`);
      await page.waitForTimeout(waitTime);
    }

    console.log('[INFO] 正在捕获DOM快照和控制台日志...');
    const domSnapshot = await page.evaluate(generateDomSnapshot);
    const pageTitle = await page.title();
    const finalUrl = page.url();
    // No response object is reported as 'N/A'.
    const statusCode = response ? response.status() : 'N/A';
    const processedConsoleLogs = formatSnapshotLogs(collector);
    const totalLogs = Array.from(collector.counts.values()).reduce((sum, count) => sum + count, 0);
    const outputContent = `
#############################################################
# Web Snapshot Report
#############################################################
- Original URL: ${targetUrl}
- Final URL: ${finalUrl}
- Page Title: ${pageTitle}
- HTTP Status: ${statusCode}
- Timestamp: ${new Date().toISOString()}
- Unique Logs: ${collector.lines.length}
- Total Logs: ${totalLogs}
- Unique Errors:${collector.errors.size}
#############################################################
# DOM Snapshot
#############################################################
${domSnapshot.trim() || 'DOM Snapshot is empty or cannot be generated.'}
#############################################################
# Console Logs
#############################################################
${processedConsoleLogs.length > 0 ? processedConsoleLogs.join('\n') : 'No console logs captured.'}
`;
    fs.writeFileSync(output, outputContent.trim());
    const absolutePath = path.resolve(output);
    console.log(`\n[SUCCESS] 快照成功保存到: ${absolutePath}`);
    console.log(absolutePath); // Bare absolute path on its own line, for consumption by other scripts.
  } catch (error) {
    console.error(`\n[ERROR] 操作失败: ${error.message}`);
    // Record the failure status without calling process.exit() here: exiting
    // inside catch would skip the finally block and leave the browser open.
    process.exitCode = 1;

    // Still try to persist whatever logs were captured before the failure.
    const processedConsoleLogs = formatSnapshotLogs(collector);
    const errorContent = `
#############################################################
# Web Snapshot - FAILED
#############################################################
- URL: ${targetUrl}
- Timestamp: ${new Date().toISOString()}
- Error: ${error.message}
-------------------------------------------------------------
# Captured Console Logs Before Failure
-------------------------------------------------------------
${processedConsoleLogs.length > 0 ? processedConsoleLogs.join('\n') : '没有捕获到控制台日志。'}
`;
    try {
      fs.writeFileSync(output, errorContent.trim());
      const absolutePath = path.resolve(output);
      console.log(`[INFO] 包含错误信息的日志已尝试保存到: ${absolutePath}`);
      console.log(absolutePath);
    } catch (writeError) {
      // Writing the failure report is best-effort; the original error was already reported above.
      console.error(`[ERROR] 无法写入错误报告: ${writeError.message}`);
    }
  } finally {
    console.log('[INFO] 正在关闭浏览器...');
    await browser.close();
  }
}
// Script entry point: parse the CLI arguments, then run the snapshot task.
(async () => {
  try {
    const options = parseArguments();
    await takeSnapshotAndLogs(options);
  } catch (error) {
    // Report anything that escapes the task (e.g. the browser failing to
    // launch) as a readable message instead of an unhandled promise rejection,
    // and terminate with a failure status.
    console.error(`[ERROR] 未处理的错误: ${error?.stack || error}`);
    process.exit(1);
  }
})();