web-image-crawler
Version:
790 lines (703 loc) • 27.1 kB
JavaScript
const fs = require('fs').promises;
const path = require('path');
const crypto = require('crypto');
const mimeTypes = require('mime-types');
const { PlaywrightCrawler } = require('crawlee');
const axios = require('axios');
/**
* 强大的图片爬虫 - 基于 Playwright 浏览器引擎
* 支持动态内容渲染、JavaScript执行、复杂交互
*/
async function crawlImages(startUrl, options = {}) {
const {
maxDepth = 4,
outputFile = 'images_report.json',
outputFormat = 'json',
maxConcurrent = 3,
timeout = 30000,
headless = true,
imageFilter = null,
minSize = 0,
excludeDataUri = false,
downloadImages = false,
outputDir = './images',
userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
waitForImages = true,
captureScreenshots = false,
detectLazyLoad = true,
includeBackgrounds = true,
includeHiddenImages = false
} = options;
// 创建结果收集器
const downloadedImages = [];
const failedDownloads = [];
const imageUrls = new Set();
const screenshotUrls = [];
// 确保输出目录存在
if (downloadImages || captureScreenshots) {
await fs.mkdir(outputDir, { recursive: true });
if (captureScreenshots) {
await fs.mkdir(path.join(outputDir, 'screenshots'), { recursive: true });
}
}
// 设置Crawlee存储目录到用户指定的输出目录中
const crawleeStorageDir = path.join(outputDir, '.crawlee-storage');
process.env.CRAWLEE_STORAGE_DIR = crawleeStorageDir;
console.log(`🚀 启动强大图片爬虫 (Playwright Browser 模式)`);
console.log(`📋 配置: maxDepth=${maxDepth}, 并发=${maxConcurrent}, 超时=${timeout}ms`);
console.log(`🎯 目标: ${startUrl}`);
console.log(`📁 输出目录: ${outputDir}`);
// 创建PlaywrightCrawler实例
const crawler = new PlaywrightCrawler({
maxRequestsPerCrawl: 1000,
maxConcurrency: maxConcurrent,
requestHandlerTimeoutSecs: Math.max(timeout / 1000, 60),
// Playwright配置
launchContext: {
launchOptions: {
headless,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-web-security',
'--disable-features=VizDisplayCompositor',
'--disable-blink-features=AutomationControlled',
`--user-agent=${userAgent}`
]
}
},
// 预处理钩子
preNavigationHooks: [
async ({ page }) => {
// 设置用户代理
await page.setExtraHTTPHeaders({
'User-Agent': userAgent
});
// 设置视口大小
await page.setViewportSize({ width: 1920, height: 1080 });
// 设置超时
page.setDefaultTimeout(timeout);
// 阻止不必要的资源加载以提高性能
await page.route('**/*', (route) => {
const resourceType = route.request().resourceType();
if (['font', 'media'].includes(resourceType)) {
route.abort();
} else {
route.continue();
}
});
// 注入懒加载检测脚本
if (detectLazyLoad) {
await page.addInitScript(() => {
// 触发懒加载
window.triggerLazyLoad = () => {
// 滚动页面触发懒加载
window.scrollTo(0, document.body.scrollHeight);
// 触发intersection observer
const images = document.querySelectorAll('img[data-src], img[data-lazy], img[loading="lazy"]');
images.forEach(img => {
if (img.dataset.src) {
img.src = img.dataset.src;
}
if (img.dataset.lazy) {
img.src = img.dataset.lazy;
}
});
};
});
}
}
],
// 主处理函数
requestHandler: async ({ request, page, log, enqueueLinks }) => {
const currentDepth = request.userData.depth || 1;
const currentUrl = request.url;
log.info(`[${currentDepth}/${maxDepth}] 🎭 Playwright爬取: ${currentUrl}`);
try {
// 等待页面加载完成
await page.waitForLoadState('networkidle', { timeout });
// 等待图片加载
if (waitForImages) {
await page.waitForFunction(() => {
const images = Array.from(document.querySelectorAll('img'));
return images.every(img => img.complete || img.naturalWidth > 0);
}, { timeout: 15000 }).catch(() => {
log.warning('部分图片未完全加载');
});
}
// 触发懒加载
if (detectLazyLoad) {
await page.evaluate(() => {
if (window.triggerLazyLoad) {
window.triggerLazyLoad();
}
});
// 等待懒加载图片
await page.waitForTimeout(2000);
}
// 页面截图
if (captureScreenshots) {
const screenshotPath = path.join(outputDir, 'screenshots', `${Date.now()}_${currentDepth}.png`);
await page.screenshot({
path: screenshotPath,
fullPage: true,
type: 'png'
});
screenshotUrls.push({
url: currentUrl,
screenshotPath,
depth: currentDepth,
timestamp: new Date().toISOString()
});
log.info(`📸 页面截图已保存: ${screenshotPath}`);
}
// 在浏览器中执行图片提取
const pageImages = await page.evaluate((options) => {
const images = [];
const { includeBackgrounds, includeHiddenImages } = options;
// 1. 提取 <img> 标签 (包括srcset和懒加载)
document.querySelectorAll('img').forEach(img => {
const rect = img.getBoundingClientRect();
const isVisible = rect.width > 0 && rect.height > 0;
if (!includeHiddenImages && !isVisible) return;
// 主要src
if (img.src && img.src !== window.location.href) {
images.push({
url: img.src,
type: 'img-tag',
element: 'img',
alt: img.alt || '',
title: img.title || '',
width: img.naturalWidth || rect.width,
height: img.naturalHeight || rect.height,
visible: isVisible,
loading: img.loading || 'eager',
className: img.className,
id: img.id
});
}
// 处理 srcset
if (img.srcset) {
const srcsetUrls = img.srcset.split(',').map(src => src.trim().split(' ')[0]);
srcsetUrls.forEach(srcUrl => {
if (srcUrl && srcUrl !== img.src) {
images.push({
url: srcUrl,
type: 'img-srcset',
element: 'img',
alt: img.alt || '',
title: img.title || '',
width: img.naturalWidth || rect.width,
height: img.naturalHeight || rect.height,
visible: isVisible,
srcsetSource: true
});
}
});
}
// 懒加载属性
['data-src', 'data-lazy', 'data-original', 'data-echo'].forEach(attr => {
const lazySrc = img.getAttribute(attr);
if (lazySrc && lazySrc !== img.src) {
images.push({
url: lazySrc,
type: 'img-lazy',
element: 'img',
alt: img.alt || '',
title: img.title || '',
width: img.naturalWidth || rect.width,
height: img.naturalHeight || rect.height,
visible: isVisible,
lazyLoadAttr: attr
});
}
});
});
// 2. 提取 CSS background-image (包括伪元素)
if (includeBackgrounds) {
document.querySelectorAll('*').forEach(el => {
const rect = el.getBoundingClientRect();
const isVisible = rect.width > 0 && rect.height > 0;
if (!includeHiddenImages && !isVisible) return;
const style = window.getComputedStyle(el);
const bgImage = style.getPropertyValue('background-image');
if (bgImage && bgImage !== 'none') {
const urlMatches = bgImage.match(/url\(['"]?([^'"]+)['"]?\)/g);
if (urlMatches) {
urlMatches.forEach(match => {
const urlMatch = match.match(/url\(['"]?([^'"]+)['"]?\)/);
if (urlMatch && urlMatch[1]) {
images.push({
url: urlMatch[1],
type: 'background-image',
element: el.tagName.toLowerCase(),
selector: getElementSelector(el),
width: rect.width,
height: rect.height,
visible: isVisible,
className: el.className,
id: el.id
});
}
});
}
// 检查伪元素背景
['::before', '::after'].forEach(pseudo => {
try {
const pseudoStyle = window.getComputedStyle(el, pseudo);
const pseudoBgImage = pseudoStyle.getPropertyValue('background-image');
if (pseudoBgImage && pseudoBgImage !== 'none') {
const urlMatch = pseudoBgImage.match(/url\(['"]?([^'"]+)['"]?\)/);
if (urlMatch && urlMatch[1]) {
images.push({
url: urlMatch[1],
type: 'pseudo-background',
element: el.tagName.toLowerCase() + pseudo,
selector: getElementSelector(el) + pseudo,
width: 0,
height: 0,
visible: false,
pseudoElement: pseudo
});
}
}
} catch (e) {
// 忽略伪元素错误
}
});
}
});
}
// 3. 提取其他类型图片
const selectors = [
'svg image[href], svg image[xlink\\:href]',
'picture source[srcset]',
'video[poster]',
'canvas',
'embed[src]',
'object[data]',
'link[rel*="icon"]'
];
selectors.forEach(selector => {
document.querySelectorAll(selector).forEach(el => {
const tagName = el.tagName.toLowerCase();
let imageUrl = null;
let imageType = 'unknown';
if (tagName === 'image') {
imageUrl = el.getAttribute('href') || el.getAttribute('xlink:href');
imageType = 'svg-image';
} else if (tagName === 'source') {
const srcset = el.getAttribute('srcset');
if (srcset) {
const urls = srcset.split(',').map(src => src.trim().split(' ')[0]);
urls.forEach(url => {
if (url) {
images.push({
url,
type: 'picture-source',
element: 'picture source',
media: el.getAttribute('media') || '',
width: 0,
height: 0,
visible: true
});
}
});
}
return;
} else if (tagName === 'video') {
imageUrl = el.getAttribute('poster');
imageType = 'video-poster';
} else if (tagName === 'canvas') {
try {
if (el.width > 0 && el.height > 0) {
imageUrl = el.toDataURL('image/png');
imageType = 'canvas-snapshot';
}
} catch (e) {
console.warn('无法访问canvas:', e.message);
}
} else if (tagName === 'embed' || tagName === 'object') {
imageUrl = el.getAttribute('src') || el.getAttribute('data');
imageType = 'embed-object';
} else if (tagName === 'link') {
imageUrl = el.getAttribute('href');
imageType = 'favicon';
}
if (imageUrl) {
const rect = el.getBoundingClientRect();
images.push({
url: imageUrl,
type: imageType,
element: tagName,
width: rect.width || el.width || 0,
height: rect.height || el.height || 0,
visible: rect.width > 0 && rect.height > 0,
rel: el.getAttribute('rel') || '',
media: el.getAttribute('media') || ''
});
}
});
});
// 辅助函数
function getElementSelector(el) {
if (!el || el.nodeType !== 1) return '';
if (el.id) return `#${el.id}`;
const parts = [];
let ancestor = el;
while (ancestor && ancestor !== document.body) {
let selector = ancestor.tagName.toLowerCase();
if (ancestor.className) {
const classes = ancestor.className.trim().split(/\s+/).filter(c => c);
if (classes.length > 0) {
selector += `.${classes.slice(0, 2).join('.')}`;
}
}
parts.unshift(selector);
ancestor = ancestor.parentElement;
if (parts.length >= 3) break;
}
return parts.join(' > ');
}
// 处理相对URL
return images
.filter(img => img.url)
.map(img => {
if (img.url.startsWith('data:')) {
return {
...img,
isDataUri: true,
fileSize: Math.round(img.url.length * 0.75)
};
}
try {
const absoluteUrl = new URL(img.url, window.location.href).href;
return { ...img, url: absoluteUrl, isDataUri: false };
} catch (e) {
console.warn('无效URL:', img.url);
return null;
}
})
.filter(Boolean);
}, { includeBackgrounds, includeHiddenImages });
// 应用过滤条件
const filteredImages = pageImages.filter(img => {
if (excludeDataUri && img.isDataUri) return false;
if (imageFilter && imageFilter.length > 0) {
const fileExtension = getFileExtension(img.url);
if (!imageFilter.includes(fileExtension)) return false;
}
if (img.isDataUri && img.fileSize < minSize) return false;
if (img.width && img.height && (img.width * img.height) < minSize) return false;
return true;
});
// 下载图片或收集URL
if (downloadImages) {
for (const img of filteredImages) {
try {
const downloadResult = await downloadImage(img, outputDir, currentUrl);
if (downloadResult.success) {
downloadedImages.push(downloadResult);
log.info(` ✅ 下载成功: ${downloadResult.filename}`);
} else {
failedDownloads.push({ ...img, error: downloadResult.error });
log.warning(` ❌ 下载失败: ${img.url} - ${downloadResult.error}`);
}
} catch (error) {
failedDownloads.push({ ...img, error: error.message });
log.error(` ❌ 下载失败: ${img.url} - ${error.message}`);
}
}
} else {
filteredImages.forEach(img => {
imageUrls.add(JSON.stringify({
...img,
crawledFrom: currentUrl,
crawledAt: new Date().toISOString(),
depth: currentDepth
}));
});
}
log.info(` 📸 找到 ${filteredImages.length} 个图片`);
// 继续爬取子页面
if (currentDepth < maxDepth) {
await enqueueLinks({
selector: 'a[href]',
limit: 20,
userData: { depth: currentDepth + 1 },
transformRequestFunction: (req) => {
const startOrigin = new URL(startUrl).origin;
const reqOrigin = new URL(req.url).origin;
if (reqOrigin !== startOrigin) {
return false;
}
return req;
}
});
}
} catch (error) {
log.error(`处理页面时出错: ${error.message}`);
}
},
failedRequestHandler: async ({ request, log }) => {
log.error(`请求失败: ${request.url}`);
},
});
// 辅助函数
function getFileExtension(url) {
if (url.startsWith('data:image/')) {
const match = url.match(/data:image\/([^;]+)/);
return match ? match[1] : '';
}
const match = url.match(/\.([^.?#]+)(?:\?|#|$)/);
return match ? match[1].toLowerCase() : '';
}
async function downloadImage(imageInfo, outputDir, pageUrl) {
try {
if (imageInfo.isDataUri) {
return await saveBase64Image(imageInfo, outputDir);
} else {
return await downloadUrlImage(imageInfo, outputDir, pageUrl);
}
} catch (error) {
return { success: false, error: error.message };
}
}
async function saveBase64Image(imageInfo, outputDir) {
try {
const matches = imageInfo.url.match(/^data:image\/([^;]+);base64,(.+)$/);
if (!matches) {
throw new Error('无效的Base64图片格式');
}
const [, mimeType, base64Data] = matches;
const extension = mimeType || 'png';
const hash = crypto.createHash('md5').update(base64Data).digest('hex');
const filename = `base64_${hash}.${extension}`;
const filepath = path.join(outputDir, filename);
try {
await fs.access(filepath);
return { success: true, filename, filepath, skipped: true };
} catch {
const buffer = Buffer.from(base64Data, 'base64');
await fs.writeFile(filepath, buffer);
return {
success: true,
filename,
filepath,
size: buffer.length,
type: imageInfo.type,
skipped: false
};
}
} catch (error) {
return { success: false, error: error.message };
}
}
async function downloadUrlImage(imageInfo, outputDir, pageUrl) {
try {
const imageUrl = imageInfo.url;
const urlObj = new URL(imageUrl);
let pathname = urlObj.pathname;
// 首先用原始路径生成基础文件名(不考虑扩展名)
pathname = pathname.replace(/^\//, '');
// 先请求图片获取 content-type
const response = await axios({
method: 'GET',
url: imageUrl,
responseType: 'stream',
timeout: 30000,
headers: {
'User-Agent': userAgent,
'Referer': pageUrl
}
});
// 根据 content-type 确定正确的扩展名
const contentType = response.headers['content-type'];
let correctExtension = null;
if (contentType) {
correctExtension = mimeTypes.extension(contentType);
}
// 如果无法从 content-type 获取扩展名,尝试从 URL 获取
if (!correctExtension) {
correctExtension = getFileExtension(imageUrl) || 'png';
}
// 生成最终文件名:如果原始路径没有扩展名或扩展名不正确,则添加/替换
let finalFilename = pathname;
const originalExtension = path.extname(pathname).replace('.', '');
if (!originalExtension || originalExtension !== correctExtension) {
// 移除原有扩展名(如果有)并添加正确的扩展名
const nameWithoutExt = pathname.replace(path.extname(pathname), '');
finalFilename = `${nameWithoutExt}.${correctExtension}`;
}
const filepath = path.join(outputDir, finalFilename);
// 检查文件是否已存在
try {
await fs.access(filepath);
return {
success: true,
filename: finalFilename,
filepath,
skipped: true,
contentType
};
} catch {
// 文件不存在,继续下载
}
// 确保目录存在
const dir = path.dirname(filepath);
await fs.mkdir(dir, { recursive: true });
// 保存文件
const writer = require('fs').createWriteStream(filepath);
response.data.pipe(writer);
await new Promise((resolve, reject) => {
writer.on('finish', resolve);
writer.on('error', reject);
});
const stats = await fs.stat(filepath);
return {
success: true,
filename: finalFilename,
filepath,
size: stats.size,
type: imageInfo.type,
contentType,
skipped: false
};
} catch (error) {
return { success: false, error: error.message };
}
}
// 启动爬虫
await crawler.run([{ url: startUrl, userData: { depth: 1 } }]);
// 生成结果
if (downloadImages) {
console.log(`\n✅ 图片下载完成! 成功下载 ${downloadedImages.length} 个图片`);
if (failedDownloads.length > 0) {
console.log(`❌ 下载失败 ${failedDownloads.length} 个图片`);
}
if (screenshotUrls.length > 0) {
console.log(`📸 页面截图 ${screenshotUrls.length} 个`);
}
const downloadStats = {
total: downloadedImages.length + failedDownloads.length,
success: downloadedImages.length,
failed: failedDownloads.length,
skipped: downloadedImages.filter(img => img.skipped).length,
totalSize: downloadedImages.reduce((sum, img) => sum + (img.size || 0), 0),
screenshots: screenshotUrls.length
};
console.log('\n📊 下载统计:');
console.log(`- 总计: ${downloadStats.total}`);
console.log(`- 成功: ${downloadStats.success}`);
console.log(`- 失败: ${downloadStats.failed}`);
console.log(`- 跳过(已存在): ${downloadStats.skipped}`);
console.log(`- 总大小: ${(downloadStats.totalSize / 1024 / 1024).toFixed(2)} MB`);
if (downloadStats.screenshots > 0) {
console.log(`- 页面截图: ${downloadStats.screenshots}`);
}
if (outputFile) {
const report = {
crawledUrl: startUrl,
crawledAt: new Date().toISOString(),
mode: 'playwright-browser',
config: {
maxDepth,
maxConcurrent,
timeout,
headless,
waitForImages,
captureScreenshots,
detectLazyLoad,
includeBackgrounds,
includeHiddenImages
},
stats: downloadStats,
downloadedImages: downloadedImages.map(img => ({
filename: img.filename,
size: img.size,
type: img.type,
contentType: img.contentType
})),
failedDownloads: failedDownloads.map(img => ({
url: img.url,
type: img.type,
error: img.error
})),
screenshots: screenshotUrls
};
// 将报告文件保存到输出目录中
const reportPath = path.join(outputDir, outputFile);
await saveResults([report], reportPath, 'json');
console.log(`\n📄 爬取报告已保存到: ${path.resolve(reportPath)}`);
}
return { downloadedImages, failedDownloads, stats: downloadStats, screenshots: screenshotUrls };
} else {
const uniqueImages = Array.from(imageUrls).map(JSON.parse);
console.log(`\n✅ 图片爬取完成! 共找到 ${uniqueImages.length} 个唯一图片`);
const typeCounts = uniqueImages.reduce((acc, img) => {
acc[img.type] = (acc[img.type] || 0) + 1;
return acc;
}, {});
console.log('📊 图片类型统计:');
Object.entries(typeCounts).forEach(([type, count]) => {
console.log(`- ${type}: ${count}`);
});
if (screenshotUrls.length > 0) {
console.log(`📸 页面截图: ${screenshotUrls.length} 个`);
}
// 在下载模式下,报告文件也保存到输出目录中
let finalOutputPath = outputFile;
if (downloadImages || captureScreenshots) {
// 确保输出目录存在
await fs.mkdir(outputDir, { recursive: true });
finalOutputPath = path.join(outputDir, outputFile);
}
await saveResults(uniqueImages, finalOutputPath, outputFormat);
console.log(`\n📄 结果已保存到: ${path.resolve(finalOutputPath)}`);
return { images: uniqueImages, screenshots: screenshotUrls };
}
}
// 保存结果函数
async function saveResults(images, outputFile, format) {
let content;
switch (format.toLowerCase()) {
case 'json':
content = JSON.stringify(images, null, 2);
break;
case 'csv':
const headers = 'URL,Type,Element,Alt,Width,Height,Visible,CrawledFrom,CrawledAt,Depth\n';
const rows = images.map(img =>
`"${img.url}","${img.type}","${img.element}","${img.alt || ''}","${img.width || ''}","${img.height || ''}","${img.visible}","${img.crawledFrom}","${img.crawledAt}","${img.depth}"`
).join('\n');
content = headers + rows;
break;
case 'txt':
default:
content = images.map(img =>
`${img.url}${img.alt ? ` (ALT: ${img.alt})` : ''} [${img.type}] [D:${img.depth}]`
).join('\n');
break;
}
await fs.writeFile(outputFile, content, 'utf-8');
}
module.exports = crawlImages;
if (require.main === module) {
(async () => {
const startUrl = 'https://eazegames.com/solitaire';
const maxDepth = 4;
const outputDir = './downloaded_images';
console.log(`🚀 开始强大图片爬取 ${startUrl} (最大深度: ${maxDepth})`);
// 使用强大的浏览器模式
await crawlImages(startUrl, {
maxDepth,
outputFile: 'playwright_report11.json',
downloadImages: true,
outputDir,
captureScreenshots: true,
detectLazyLoad: true,
waitForImages: true,
includeBackgrounds: true,
includeHiddenImages: false
});
})();
}