sameweb-cli
Version:
A CLI tool to clone websites into local folders using Puppeteer and AI. Useful for developers, researchers, and learners who want to quickly grab a site's structure and assets.
325 lines (278 loc) • 10.2 kB
JavaScript
import puppeteer from "puppeteer";
import fs from "fs";
import path from "path";
import axios from "axios";
import beautify from "js-beautify";
/**
 * Make sure `dir` exists, creating it and any missing parent folders.
 *
 * If anything is already present at `dir` the function returns without
 * touching the file system.
 *
 * @param {string} dir - Directory path to create when absent.
 * @returns {void}
 */
function ensureDir(dir) {
  const alreadyPresent = fs.existsSync(dir);
  if (alreadyPresent) {
    return;
  }
  fs.mkdirSync(dir, { recursive: true });
}
/**
 * Scroll the document down in fixed steps so lazily-loaded content gets a
 * chance to render.
 *
 * The scrolling runs inside the browser context via `page.evaluate`: every
 * second the window is scrolled by 1000px, and the loop stops once the total
 * distance scrolled reaches the body's current `scrollHeight`.
 *
 * NOTE: there is no upper bound on the number of steps; a document whose
 * height keeps growing keeps the loop running.
 *
 * @param {object} page - Object exposing `evaluate(fn)` (a page or a frame).
 * @returns {Promise<void>} Resolves when the bottom has been reached.
 */
async function autoScroll(page) {
  // Must stay self-contained: the function body executes in the browser
  // context, so it cannot close over anything declared out here.
  const scrollToBottom = () =>
    new Promise((done) => {
      const stepPx = 1000;
      const intervalMs = 1000;
      let scrolledPx = 0;
      const ticker = setInterval(() => {
        // Read the height before scrolling; it may grow as content loads.
        const pageHeight = document.body.scrollHeight;
        window.scrollBy(0, stepPx);
        scrolledPx += stepPx;
        if (scrolledPx < pageHeight) {
          return;
        }
        clearInterval(ticker);
        done();
      }, intervalMs);
    });

  await page.evaluate(scrollToBottom);
}
/**
 * Best-effort pass over every <iframe> on the page: wait for each frame's
 * document to finish parsing, then auto-scroll it so lazily-loaded content
 * inside the frame is triggered.
 *
 * Iframes with an empty, `data:` or `blob:` src are skipped. Any failure
 * (detached element, inaccessible frame, timeout) is deliberately ignored so
 * that a single problematic iframe never aborts the scrape.
 *
 * @param {object} page - Puppeteer page to inspect.
 * @returns {Promise<void>}
 */
async function processIframes(page) {
  let iframes;
  try {
    iframes = await page.$$("iframe");
  } catch (error) {
    // The iframe list could not be read; there is nothing to process.
    return;
  }

  for (const iframe of iframes) {
    try {
      const src = await iframe.evaluate((el) => el.src);
      if (!src || src.startsWith("data:") || src.startsWith("blob:")) {
        continue;
      }

      const frame = await iframe.contentFrame();
      if (!frame) {
        continue;
      }

      // Puppeteer frames have no `waitForLoadState` (that is Playwright's
      // API); calling it threw a TypeError that was swallowed, so frames were
      // never scrolled. Poll the frame's readyState instead.
      await frame.waitForFunction(() => document.readyState !== "loading", {
        timeout: 10000,
      });
      await autoScroll(frame);
    } catch (iframeError) {
      // Intentionally ignored: iframe handling is best-effort.
    }
  }
}
// Lists the URLs to attempt for an asset: the original first, then fallbacks
// with the `?dpl=` suffix (Next.js chunks) or the whole query string removed.
function buildCandidateUrls(assetUrl) {
  const candidates = [assetUrl];
  if (
    assetUrl.includes("?dpl=") &&
    assetUrl.includes("_next/static/chunks/")
  ) {
    candidates.push(assetUrl.split("?dpl=")[0]);
  }
  const withoutQuery = assetUrl.split("?")[0];
  if (assetUrl.includes("?") && !candidates.includes(withoutQuery)) {
    candidates.push(withoutQuery);
  }
  return candidates;
}

// Maps a Content-Type header value to a file extension (leading dot
// included), or "" when no extension can be inferred.
function extensionForContentType(contentType) {
  if (!contentType) {
    return "";
  }
  if (contentType.includes("javascript")) {
    return ".js";
  }
  if (contentType.includes("css")) {
    return ".css";
  }
  if (contentType.includes("image")) {
    // Drop parameters such as "; charset=utf-8" before reading the subtype.
    const mimeType = contentType.split(";")[0].trim().toLowerCase();
    const subtype = mimeType.split("/")[1];
    if (!subtype) {
      return "";
    }
    // Subtypes whose conventional extension differs from the subtype itself.
    const aliases = new Map([
      ["svg+xml", "svg"],
      ["x-icon", "ico"],
      ["vnd.microsoft.icon", "ico"],
    ]);
    return `.${aliases.get(subtype) ?? subtype}`;
  }
  return "";
}

/**
 * Download one asset into `directory`, mirroring the URL's path.
 *
 * Each attempt tries the original URL and then its query-stripped fallbacks.
 * When every candidate fails, the function waits `1000 * attempt` ms and
 * retries, up to `maxRetries` attempts. It never throws: failures are
 * reported through the returned object.
 *
 * When the URL path has no extension, one is inferred from the response's
 * Content-Type (JavaScript, CSS, images). The returned `localPath` is the
 * path of the file actually written, relative to `directory`, using forward
 * slashes so it can be placed directly into HTML.
 *
 * URLs whose path is empty or ends in "/" do not name a file; they are
 * reported as failures immediately, without a request, so the download can
 * never write onto a directory path.
 *
 * @param {string} assetUrl - Absolute URL of the asset.
 * @param {string} directory - Output directory the asset is stored under.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @returns {Promise<
 *   {success: true, originalUrl: string, actualUrl: string, localPath: string} |
 *   {success: false, originalUrl: string, error: (number|string)}
 * >} Outcome of the download; `error` is the HTTP status when available,
 *   otherwise the error message.
 */
async function downloadAsset(assetUrl, directory, maxRetries = 3) {
  const failure = (error) => ({
    success: false,
    originalUrl: assetUrl,
    error,
  });
  const urlsToTry = buildCandidateUrls(assetUrl);
  // Also covers maxRetries < 1, where no attempt is ever made.
  let lastError = new Error("No download attempts were made");

  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    for (const tryUrl of urlsToTry) {
      try {
        const parsedUrl = new URL(tryUrl);
        let localPath = parsedUrl.pathname.replace(/^\//, "");
        if (localPath === "" || localPath.endsWith("/")) {
          // Retrying cannot help: the path would resolve to a directory.
          return failure("URL path does not name a file");
        }

        const response = await axios.get(tryUrl, {
          responseType: "arraybuffer",
          timeout: 30000,
          headers: {
            "User-Agent":
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
          },
        });

        if (!path.extname(localPath)) {
          localPath += extensionForContentType(
            response.headers["content-type"],
          );
        }

        const filePath = path.join(directory, localPath);
        ensureDir(path.dirname(filePath));
        fs.writeFileSync(filePath, response.data);

        return {
          success: true,
          originalUrl: assetUrl,
          actualUrl: tryUrl,
          localPath,
        };
      } catch (urlError) {
        // Remember the failure and fall through to the next candidate URL.
        lastError = urlError;
      }
    }

    if (attempt < maxRetries) {
      // Linear back-off before the next round of attempts.
      await new Promise((resolve) => setTimeout(resolve, 1000 * attempt));
    }
  }

  return failure(lastError.response?.status || lastError.message);
}
/**
 * Clone a web page into `directory`: render it with Puppeteer, download the
 * assets it references, rewrite their URLs in the HTML to the local copies
 * and save the result as `index.html`.
 *
 * Steps: launch a headless browser, navigate to `url`, optionally process
 * iframes, auto-scroll to trigger lazy content, capture and beautify the
 * HTML, collect asset URLs (img/link/script elements plus computed
 * background images), download each one sequentially, then write the HTML.
 * The browser is always closed, even on failure.
 *
 * NOTE(review): the browser is launched with `--no-sandbox`,
 * `--disable-setuid-sandbox` and `--disable-web-security`. These flags relax
 * Chromium's isolation; confirm they are acceptable for untrusted sites.
 *
 * @param {string} url - Page to scrape.
 * @param {string} directory - Output directory for `index.html` and assets.
 * @param {object} [options]
 * @param {boolean} [options.ignoreFailedAssets=true] - When false, the first
 *   failed asset download aborts the scrape with an error.
 * @param {number} [options.timeout=60000] - Navigation timeout in ms.
 * @param {boolean} [options.waitForNetworkIdle=true] - Wait for
 *   "networkidle2" instead of "domcontentloaded".
 * @param {boolean} [options.processIframesEnabled=true] - Run the iframe pass.
 * @returns {Promise<void>}
 * @throws {Error} Any fatal error is logged and rethrown.
 */
export const getScrapeWebsite = async (url, directory, options = {}) => {
  const {
    ignoreFailedAssets = true,
    timeout = 60000,
    waitForNetworkIdle = true,
    processIframesEnabled = true,
  } = options;
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: true,
      args: [
        "--start-maximized",
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-web-security",
        "--disable-features=VizDisplayCompositor",
      ],
      defaultViewport: null,
    });
    const page = await browser.newPage();
    await page.setUserAgent(
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    );
    const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
    console.log(`🚀 Starting to scrape: ${url}`);
    try {
      await page.goto(url, {
        waitUntil: waitForNetworkIdle ? "networkidle2" : "domcontentloaded",
        timeout: timeout,
      });
    } catch (navigationError) {
      // Continue anyway: the page is sometimes usable despite a timeout.
      // Surface the problem so an empty or error page is not saved silently.
      console.warn(`⚠️ Navigation warning: ${navigationError.message}`);
    }
    // Give late scripts a moment to settle before inspecting the page.
    await delay(5000);
    if (processIframesEnabled) {
      await processIframes(page);
    }
    await autoScroll(page);
    await delay(2000);

    const rawHtml = await page.content();
    let html = beautify.html(rawHtml, {
      indent_size: 2,
      preserve_newlines: true,
      end_with_newline: true,
    });

    // Runs in the browser context; must not reference outer variables.
    const assets = await page.evaluate(() => {
      const urls = [];
      // Images
      document.querySelectorAll("img[src]").forEach((el) => {
        if (el.src && !el.src.startsWith("data:")) urls.push(el.src);
      });
      // Stylesheets (and any other <link href>)
      document.querySelectorAll("link[href]").forEach((el) => {
        if (el.href && !el.href.startsWith("data:")) urls.push(el.href);
      });
      // Scripts
      document.querySelectorAll("script[src]").forEach((el) => {
        if (el.src && !el.src.startsWith("data:")) urls.push(el.src);
      });
      // Background images from computed styles
      document.querySelectorAll("*").forEach((el) => {
        const bgImage = window.getComputedStyle(el).backgroundImage;
        if (bgImage && bgImage !== "none") {
          const match = bgImage.match(/url\(['"]?([^'"]+)['"]?\)/);
          if (match && match[1] && !match[1].startsWith("data:")) {
            urls.push(
              match[1].startsWith("http")
                ? match[1]
                : new URL(match[1], window.location.origin).href,
            );
          }
        }
      });
      return [...new Set(urls)];
    });

    const failedDownloads = [];
    for (const assetUrl of assets) {
      const result = await downloadAsset(assetUrl, directory);
      if (result.success) {
        try {
          const regex = new RegExp(
            result.originalUrl.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"),
            "g",
          );
          // Use a replacer function so "$&", "$1", ... inside the local path
          // are inserted literally rather than as replacement patterns.
          html = html.replace(regex, () => result.localPath);
        } catch (replaceError) {
          console.log(
            `⚠️ Error replacing URL in HTML: ${replaceError.message}`,
          );
        }
      } else {
        failedDownloads.push(result);
        if (!ignoreFailedAssets) {
          throw new Error(
            `Failed to download critical asset: ${result.originalUrl}`,
          );
        }
      }
    }

    ensureDir(directory);
    fs.writeFileSync(path.join(directory, "index.html"), html);
    if (failedDownloads.length > 0) {
      failedDownloads.forEach((f) =>
        console.log(` - ${f.originalUrl}: ${f.error}`),
      );
    }
    console.log(`📁 Files saved to: ${directory}`);
  } catch (error) {
    console.error("💥 Fatal error during scraping:", error.message);
    throw error;
  } finally {
    if (browser) {
      await browser.close();
    }
  }
};
// Usage examples:
// Basic usage (ignores failed assets by default)
// getScrapeWebsite("https://hitesh.ai", "hiteshai");
// With custom options
// getScrapeWebsite("https://hitesh.ai", "hiteshai", {
// ignoreFailedAssets: true, // Continue scraping even if some assets fail
// timeout: 30000, // 30 second timeout for page load
// waitForNetworkIdle: true, // Wait for network to be idle
// processIframesEnabled: true // Process iframe content
// });