// scrapegraph-js
// Scrape and extract structured data from a webpage using ScrapeGraphAI's APIs.
// Supports cookies for authentication, infinite scrolling, and pagination.
// (270 lines (232 loc) • 9.84 kB • JavaScript)
/**
* Example script demonstrating the ScrapeGraphAI Crawler markdown conversion mode.
*
* This example shows how to use the crawler in markdown conversion mode:
* - Cost-effective markdown conversion (NO AI/LLM processing)
* - 2 credits per page (80% savings compared to AI mode)
* - Clean HTML to markdown conversion with metadata extraction
*
* Requirements:
* - Node.js 14+
* - dotenv
* - A .env file with your API_KEY
*
* Example .env file:
* API_KEY=your_api_key_here
*/
import 'dotenv/config';
// Configuration — values come from the environment (loaded from .env by dotenv).
// The setup docs in this file tell users to put API_KEY in .env, but the code
// previously read only TEST_API_KEY; accept both, with TEST_API_KEY taking
// precedence to preserve existing behavior.
const API_KEY = process.env.TEST_API_KEY || process.env.API_KEY || "sgai-xxx";
const BASE_URL = process.env.BASE_URL || "http://localhost:8001"; // Can be overridden via env
/**
 * POST a JSON payload to a ScrapeGraphAI API endpoint.
 *
 * Authenticates with the module-level API_KEY via the SGAI-APIKEY header.
 *
 * @param {string} url - Fully-qualified endpoint URL
 * @param {Object} data - JSON-serializable request body
 * @returns {Promise<Object>} The parsed JSON response body
 */
async function makeRequest(url, data) {
  const response = await fetch(url, {
    method: 'POST',
    headers: {
      "Content-Type": "application/json",
      "SGAI-APIKEY": API_KEY,
    },
    body: JSON.stringify(data),
  });
  return response.json();
}
/**
 * Fetch the current state of a crawl job.
 *
 * On HTTP 429 the body is not parsed; instead a synthetic
 * `{ status: "rate_limited", retry_after: 60 }` object is returned so the
 * polling loop can back off.
 *
 * @param {string} taskId - The task ID to poll for
 * @returns {Promise<Object>} The response JSON (or the rate-limit marker)
 */
async function pollResult(taskId) {
  const response = await fetch(`${BASE_URL}/v1/crawl/${taskId}`, {
    method: 'GET',
    headers: { "SGAI-APIKEY": API_KEY },
  });
  // Rate limited — signal the caller to wait instead of parsing a body.
  if (response.status === 429) {
    return { status: "rate_limited", retry_after: 60 };
  }
  return response.json();
}
/**
 * Poll for crawl results with intelligent backoff to avoid rate limits.
 *
 * Fix vs. previous version: the "Crawl failed" error used to be thrown inside
 * the same `try` whose `catch` handles transport errors, so the function
 * caught its own failure, logged it as a polling error, slept, and kept
 * retrying a job that could never succeed. The `try` now wraps only the
 * network call, so a failed-job error propagates immediately. The
 * `retry_after` hint returned by pollResult() is also honored now.
 *
 * @param {string} taskId - The task ID to poll for
 * @param {number} maxAttempts - Maximum number of polling attempts
 * @returns {Promise<Object>} The final result
 * @throws {Error} When the crawl fails, on unrecoverable polling errors,
 *                 or when maxAttempts is exhausted
 */
async function pollWithBackoff(taskId, maxAttempts = 20) {
  console.log("⏳ Starting to poll for results with rate-limit protection...");
  // Initial wait to give the job time to start processing.
  await new Promise(resolve => setTimeout(resolve, 15000));
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    let result;
    try {
      // Only the network call is guarded here; status handling below must
      // be free to throw without being swallowed by this catch.
      result = await pollResult(taskId);
    } catch (error) {
      if (error.message.toLowerCase().includes('rate') || error.message.includes('429')) {
        const waitTime = Math.min(90, 45 + (attempt * 10));
        console.log(`⚠️ Rate limit detected in error, waiting ${waitTime}s...`);
        await new Promise(resolve => setTimeout(resolve, waitTime * 1000));
        continue;
      }
      console.log(`❌ Error polling for results: ${error.message}`);
      if (attempt < maxAttempts - 1) {
        await new Promise(resolve => setTimeout(resolve, 20000)); // Wait before retry
        continue;
      }
      throw error;
    }
    const status = result.status;
    if (status === "rate_limited") {
      // Prefer the server-provided retry hint; fall back to linear backoff.
      const waitTime = Math.min(90, result.retry_after ?? (30 + (attempt * 10)));
      console.log(`⚠️ Rate limited! Waiting ${waitTime}s before retry...`);
      await new Promise(resolve => setTimeout(resolve, waitTime * 1000));
    } else if (status === "success") {
      return result;
    } else if (status === "failed") {
      // Terminal state — abort immediately instead of retrying.
      throw new Error(`Crawl failed: ${result.error || 'Unknown error'}`);
    } else {
      // Still pending: progressive wait, starting at 15s and capped at 60s.
      const baseWait = 15;
      const progressiveWait = Math.min(60, baseWait + (attempt * 3));
      console.log(`⏳ Status: ${status} (attempt ${attempt + 1}/${maxAttempts}) - waiting ${progressiveWait}s...`);
      await new Promise(resolve => setTimeout(resolve, progressiveWait * 1000));
    }
  }
  throw new Error(`⏰ Timeout: Job did not complete after ${maxAttempts} attempts`);
}
/**
 * Markdown Conversion Mode (NO AI/LLM Used)
 *
 * Submits a crawl job with `extraction_mode: false` (pure HTML-to-markdown
 * conversion, no AI processing), polls until it completes, then prints a
 * cost summary and a short markdown preview for up to three pages.
 * Reads module-level BASE_URL and uses makeRequest/pollWithBackoff above.
 */
async function markdownCrawlingExample() {
  console.log("=".repeat(60));
  console.log("MARKDOWN CONVERSION MODE (NO AI/LLM)");
  console.log("=".repeat(60));
  console.log("Use case: Get clean markdown content without AI processing");
  console.log("Cost: 2 credits per page (80% savings!)");
  console.log("Features: Clean markdown conversion, metadata extraction");
  console.log("⚠️ NO AI/LLM PROCESSING - Pure HTML to markdown conversion only!");
  console.log();
  // Markdown conversion request - NO AI/LLM processing
  const requestData = {
    url: "https://scrapegraphai.com/",
    extraction_mode: false, // FALSE = Markdown conversion mode (NO AI/LLM used)
    depth: 2,
    max_pages: 2,
    same_domain_only: true,
    sitemap: false, // false = don't consult the sitemap; set true for better page coverage
    // Note: No prompt needed when extraction_mode = false
  };
  console.log(`🌐 Target URL: ${requestData.url}`);
  console.log("🤖 AI Prompt: None (no AI processing)");
  console.log(`📊 Crawl Depth: ${requestData.depth}`);
  console.log(`📄 Max Pages: ${requestData.max_pages}`);
  console.log(`🗺️ Use Sitemap: ${requestData.sitemap}`);
  console.log("💡 Mode: Pure HTML to markdown conversion");
  console.log();
  // Start the markdown conversion job
  console.log("🚀 Starting markdown conversion job...");
  const response = await makeRequest(`${BASE_URL}/v1/crawl`, requestData);
  const taskId = response.task_id;
  // A missing task_id means the job was rejected; nothing to poll for.
  if (!taskId) {
    console.log("❌ Failed to start markdown conversion job");
    return;
  }
  console.log(`📋 Task ID: ${taskId}`);
  console.log("⏳ Polling for results...");
  console.log();
  // Poll for results with rate-limit protection
  try {
    const result = await pollWithBackoff(taskId, 20);
    console.log("✅ Markdown conversion completed successfully!");
    console.log();
    // Defensive defaults: every field may be absent in the response payload.
    const resultData = result.result || {};
    const pages = resultData.pages || [];
    const crawledUrls = resultData.crawled_urls || [];
    const creditsUsed = resultData.credits_used || 0;
    const pagesProcessed = resultData.pages_processed || 0;
    console.log("📊 CONVERSION RESULTS:");
    console.log("-".repeat(40));
    console.log(`📄 Pages processed: ${pagesProcessed}`);
    console.log(`💰 Credits used: ${creditsUsed}`);
    // Guard against division by zero when nothing was processed.
    console.log(`💵 Cost per page: ${pagesProcessed > 0 ? (creditsUsed / pagesProcessed).toFixed(1) : 0} credits`);
    if (crawledUrls.length > 0) {
      console.log(`🔗 URLs processed: ${JSON.stringify(crawledUrls)}`);
    }
    console.log();
    console.log("📝 MARKDOWN CONTENT:");
    console.log("-".repeat(40));
    if (pages.length > 0) {
      console.log(`📄 Total pages with markdown: ${pages.length}`);
      pages.slice(0, 3).forEach((page, i) => { // Show first 3 pages only
        console.log(`\n📄 Page ${i + 1}:`);
        console.log(`   URL: ${page.url || 'N/A'}`);
        console.log(`   Title: ${page.title || 'None'}`);
        const metadata = page.metadata || {};
        console.log(`   📊 Word count: ${metadata.word_count || 0}`);
        console.log(`   📋 Headers: ${JSON.stringify((metadata.headers || []).slice(0, 3))}`); // First 3 headers
        console.log(`   🔗 Links: ${metadata.links_count || 0}`);
        // Show a 200-character markdown preview, with ellipsis when truncated.
        const markdownContent = page.markdown || "";
        let markdownPreview = markdownContent.substring(0, 200);
        if (markdownContent.length > 200) {
          markdownPreview += "...";
        }
        console.log(`   📝 Content preview: ${markdownPreview}`);
      });
      if (pages.length > 3) {
        console.log(`\n   ... and ${pages.length - 3} more pages with markdown content`);
      }
    } else {
      console.log("No markdown content available");
    }
  } catch (error) {
    console.log(`❌ Markdown conversion failed: ${error.message}`);
  }
}
/**
 * Entry point: prints the banner, validates the API key, and runs the
 * markdown crawling example, finishing with a cost-savings summary.
 */
async function main() {
  const divider = "=".repeat(60);
  console.log("🌐 ScrapeGraphAI Crawler - Markdown Conversion Example");
  console.log("Cost-effective HTML to Markdown conversion (NO AI/LLM)");
  console.log(divider);
  // Bail out early when the placeholder API key was never replaced.
  if (API_KEY === "sgai-xxx") {
    const setupHelp = [
      "⚠️ Please set your API key in the .env file",
      " Create a .env file with your API key:",
      " API_KEY=your_api_key_here",
      "",
      " You can get your API key from: https://dashboard.scrapegraphai.com",
      "",
      " Example .env file:",
      " API_KEY=sgai-your-actual-api-key-here",
      " BASE_URL=https://api.scrapegraphai.com # Optional",
    ];
    for (const line of setupHelp) {
      console.log(line);
    }
    return;
  }
  console.log(`🔑 Using API key: ${API_KEY.substring(0, 10)}...`);
  console.log(`🌐 Base URL: ${BASE_URL}`);
  console.log();
  // Run the single example: markdown conversion mode (NO AI).
  await markdownCrawlingExample();
  console.log("\n" + divider);
  console.log("🎉 Example completed!");
  console.log("💡 This demonstrates markdown conversion mode:");
  const highlights = [
    " • Cost-effective: Only 2 credits per page",
    " • No AI/LLM processing - pure HTML to markdown conversion",
    " • Perfect for content archival and documentation",
    " • 80% cheaper than AI extraction modes!",
  ];
  for (const line of highlights) {
    console.log(line);
  }
}
// Run the example; surface any unhandled rejection on stderr instead of
// letting Node terminate with an unhandled-promise warning.
main().catch(console.error);