UNPKG

@pinkpixel/prysm-llm

Version:

Structure-focused web scraper optimized for LLMs and designed for use with MCP, producing minimal console output. Configure the output directory with the PRYSM_OUTPUT_DIR environment variable.

515 lines (463 loc) 15.3 kB
// test_scraper.js - Test runner for the Prysm scraper

const { scrape } = require('./main_scraper');
const fs = require('fs').promises;
const path = require('path');
const chalk = require('chalk');

/**
 * Display a colorful banner
 */
function displayBanner() {
  console.log(chalk.magentaBright(`
  ██████╗ ██████╗ ██╗   ██╗███████╗███╗   ███╗
  ██╔══██╗██╔══██╗╚██╗ ██╔╝██╔════╝████╗ ████║
  ██████╔╝██████╔╝ ╚████╔╝ ███████╗██╔████╔██║
  ██╔═══╝ ██╔══██╗  ╚██╔╝  ╚════██║██║╚██╔╝██║
  ██║     ██║  ██║   ██║   ███████║██║ ╚═╝ ██║
  ╚═╝     ╚═╝  ╚═╝   ╚═╝   ╚══════╝╚═╝     ╚═╝`));
  console.log(chalk.cyanBright(`
  ✨ TEST RUNNER
  ------------------------------`));
}

/**
 * Return the value that follows a command line flag, or fail loudly.
 *
 * @param {string} flag - The flag being parsed (used in the error message).
 * @param {string|undefined} value - The argument that follows the flag.
 * @returns {string} The value, unchanged.
 * @throws {Error} If the flag was the last argument and has no value.
 */
function requireValue(flag, value) {
  if (value === undefined) {
    throw new Error(`Missing value for ${flag} (run with --help for usage)`);
  }
  return value;
}

/**
 * Parse command line arguments
 *
 * Reads process.argv (skipping the node binary and script path) and builds
 * the options object consumed by runTests. Unknown arguments are ignored.
 *
 * @returns {{mode: string, target: (string|null), scrapeImages: boolean,
 *   downloadImages: boolean, maxSites: number, outputDir: string,
 *   showHelp: boolean, verbose: boolean}} Parsed options.
 * @throws {Error} If a flag that takes a value is given without one.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const options = {
    mode: 'all', // Default to testing all sites
    target: null,
    scrapeImages: false,
    downloadImages: false,
    maxSites: Infinity,
    outputDir: path.join(__dirname, 'test_results'),
    showHelp: false,
    verbose: false
  };

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    const nextArg = args[i + 1];

    switch (arg) {
      case '--help':
      case '-h':
        options.showHelp = true;
        break;
      case '--category':
      case '-c':
        options.mode = 'category';
        options.target = requireValue(arg, nextArg);
        i++;
        break;
      case '--name':
      case '-n':
        options.mode = 'name';
        options.target = requireValue(arg, nextArg);
        i++;
        break;
      case '--scrape-images':
        options.scrapeImages = true;
        break;
      case '--download-images':
        options.scrapeImages = true; // Must scrape to download
        options.downloadImages = true;
        break;
      case '--max': {
        // Only a positive integer is a usable limit. Anything else (missing,
        // non-numeric, zero, negative) falls back to "no limit"; a negative
        // value must not reach Array.prototype.slice, where it would drop
        // sites from the end of the list instead of limiting the count.
        const parsedMax = Number.parseInt(nextArg, 10);
        options.maxSites = parsedMax > 0 ? parsedMax : Infinity;
        i++;
        break;
      }
      case '--output':
      case '-o':
        options.outputDir = requireValue(arg, nextArg);
        i++;
        break;
      case '--verbose':
      case '-v':
        options.verbose = true;
        break;
    }
  }

  return options;
}

/**
 * Display help message
 *
 * The category list is derived from ALL_TEST_SITES so the help text cannot
 * drift out of sync with the categories that actually exist.
 */
function showHelp() {
  const categories = [...new Set(ALL_TEST_SITES.map(site => site.category))].join(', ');

  console.log(`
${chalk.whiteBright('Usage:')}
  ${chalk.greenBright('node test_scraper.js')} ${chalk.yellowBright('[options]')}

${chalk.whiteBright('Test Modes:')}
  ${chalk.yellowBright('--category, -c <category>')}  Test sites from a specific category
      Categories: ${categories}
  ${chalk.yellowBright('--name, -n <site_name>')}     Test a specific site by name

${chalk.whiteBright('Options:')}
  ${chalk.yellowBright('--scrape-images')}            Enable image scraping
  ${chalk.yellowBright('--download-images')}          Download images (enables scraping)
  ${chalk.yellowBright('--max <number>')}             Maximum number of sites to test
  ${chalk.yellowBright('--output, -o <directory>')}   Output directory for results
  ${chalk.yellowBright('--verbose, -v')}              Show detailed logs
  ${chalk.yellowBright('--help, -h')}                 Show this help message

${chalk.whiteBright('Examples:')}
  ${chalk.greenBright('node test_scraper.js')}  Test all sites
  ${chalk.greenBright('node test_scraper.js')} ${chalk.blueBright('--category news')}  Test news sites only
  ${chalk.greenBright('node test_scraper.js')} ${chalk.blueBright('--name "BBC News"')}  Test specific site
  ${chalk.greenBright('node test_scraper.js')} ${chalk.blueBright('--category recipe --max 2')}  Test 2 recipe sites
  ${chalk.greenBright('node test_scraper.js')} ${chalk.blueBright('--download-images')}  Download images for all sites
`);
}

// Create a comprehensive test suite combining sites from various categories
const ALL_TEST_SITES = [
  // Recipe Sites
  { name: 'AllRecipes', url: 'https://www.allrecipes.com/recipe/256618/cubanelle-and-veal-bolognese/', category: 'recipe' },
  { name: '101 Cookbooks', url: 'https://www.101cookbooks.com/', category: 'recipe' },
  { name: 'Smitten Kitchen', url: 'https://smittenkitchen.com/', category: 'recipe' },
  { name: 'Serious Eats', url: 'https://www.seriouseats.com/classic-panzanella-salad-recipe', category: 'recipe' },
  { name: 'Food Network', url: 'https://www.foodnetwork.com/recipes/food-network-kitchen/instant-pot-barbecue-pulled-pork-sandwiches-8306825', category: 'recipe' },
  { name: 'BBC Good Food', url: 'https://www.bbcgoodfood.com/recipes/easy-chocolate-cake', category: 'recipe' },

  // Blog/Article Sites
  { name: 'Medium Article', url: 'https://medium.com/blog/what-i-wish-i-knew-navigating-the-geography-of-a-creative-life-requires-a-compass-not-a-map-b7f10afdbd6a', category: 'article' },
  { name: 'Wix Site', url: 'https://robmensching.com/blog/', category: 'blog' },
  { name: 'WordPress Blog', url: 'https://citizenwells.com/2022/02/20/everyone-is-at-risk-for-blood-clots-cdc-and-pfizer-try-to-normalize-diffuse-widespread-stories-of-athletes-collapsing-and-dying-after-covid-jabs-athletes-collapsing-on-field-are/', category: 'blog' },
  { name: 'Blogger', url: 'https://althouse.blogspot.com/2025/04/would-american-public-stand-for-it-it.html', category: 'blog' },
  { name: 'Substack', url: 'https://substack.com/browse/staff-picks/post/150741708', category: 'newsletter' },

  // News Sites
  { name: 'BBC News', url: 'https://www.bbc.com/news/articles/c0m90jjewd7o', category: 'news' },
  { name: 'CNN', url: 'https://www.cnn.com/2025/04/02/health/cory-booker-prepared-his-body-speech/index.html', category: 'news' },
  { name: 'New York Times', url: 'https://www.nytimes.com/live/2025/04/02/business/trump-tariffs-liberation-day', category: 'news' },
  { name: 'The Verge', url: 'https://www.theverge.com/news/642049/nintendo-switch-2-verge-staff-reacts', category: 'news' },

  // Technical Sites
  { name: 'Stack Overflow', url: 'https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array', category: 'technical' },
  { name: 'GitHub Readme', url: 'https://github.com/puppeteer/puppeteer', category: 'technical' },
  { name: 'TechHub', url: 'https://techhub.iodigital.com/articles/what-is-webllm?ref=dailydev', category: 'technical' },

  // Documentation Sites
  { name: 'MDN Web Docs', url: 'https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map', category: 'documentation' },
  { name: 'Unsloth Docs', url: 'https://docs.unsloth.ai/', category: 'documentation' },
  { name: 'Cursor Docs', url: 'https://docs.cursor.com/get-started/welcome', category: 'documentation' },

  // Wiki Sites
  { name: 'Wikipedia', url: 'https://en.wikipedia.org/wiki/Artificial_intelligence', category: 'wiki' },
  { name: 'WikiHow', url: 'https://www.wikihow.com/Make-a-Tree-Seat', category: 'wiki' },
  { name: 'WikiBooks', url: 'https://en.wikibooks.org/wiki/PSP_Programming/Text_Menu', category: 'wiki' },

  // Product Sites
  { name: 'macys', url: 'https://www.macys.com/shop/product/franco-sarto-womens-marlina-fisherman-pointed-toe-kitten-heel-mules?ID=19490453', category: 'product' },
  { name: 'Amazon Product', url: 'https://www.amazon.com/Paris-Hydrating-Dehydrated-Hyaluronic-Paraben-Free/dp/B0BCR23QDG', category: 'ecommerce' },

  // Travel Sites
  { name: 'Lonely Planet', url: 'https://www.lonelyplanet.com/articles/best-places-to-visit-in-japan', category: 'travel' },
  { name: 'TripAdvisor', url: 'https://www.tripadvisor.com/Tourism-g60763-New_York_City_New_York-Vacations.html', category: 'travel' },

  // Social Media Sites
  { name: 'CigarScanner', url: 'https://www.cigarscanner.com/tabs/social/users/ebff7911-0be1-4e84-8211-ce7930a508d2', category: 'social' },
  { name: 'Instagram', url: 'https://www.instagram.com/neogurumi/', category: 'social' },

  // Government and Legal Sites
  { name: 'White House', url: 'https://www.whitehouse.gov/', category: 'government' },
  { name: 'New Hampshire', url: 'https://gc.nh.gov/rsa/html/LV/540/540-mrg.htm', category: 'government' },
  { name: 'Alabama Law', url: 'https://law.justia.com/codes/alabama/title-28/chapter-1/section-28-1-5/', category: 'legal' },

  // Education Sites
  { name: 'Coursera', url: 'https://www.coursera.org/degrees/ms-computer-science-boulder', category: 'education' },
  { name: 'Rasmussen', url: 'https://www.rasmussen.edu/student-experience/college-life/15-educational-search-engines/', category: 'education' },
  { name: 'Harvard', url: 'https://news.harvard.edu/gazette/story/2024/08/on-move-in-day-hugs-sweat-and-tears/', category: 'education' },

  // Entertainment Sites
  { name: 'IMDB', url: 'https://www.imdb.com/title/tt0117500/', category: 'entertainment' },
  { name: 'Rotten Tomatoes', url: 'https://www.rottentomatoes.com/m/the_dark_knight', category: 'entertainment' },
  { name: 'Metacritic', url: 'https://www.metacritic.com/game/pc/the-witcher-3-wild-hunt', category: 'entertainment' },
];

/**
 * Get the list of sites to test based on options
 *
 * Category and name lookups are both case-insensitive. A missing target in
 * 'category' or 'name' mode matches nothing instead of throwing.
 *
 * @param {{mode: string, target: (string|null), maxSites: number}} options
 * @returns {Array<{name: string, url: string, category: string}>} Matching
 *   sites, truncated to at most options.maxSites entries.
 */
function getSitesToTest(options) {
  // Normalise once so a null/undefined target cannot crash the filters below.
  const target = String(options.target ?? '').toLowerCase();
  let sitesToTest = [];

  switch (options.mode) {
    case 'category':
      sitesToTest = ALL_TEST_SITES.filter(site => site.category.toLowerCase() === target);
      break;
    case 'name':
      sitesToTest = ALL_TEST_SITES.filter(site => site.name.toLowerCase() === target);
      break;
    case 'all':
    default:
      sitesToTest = ALL_TEST_SITES;
      break;
  }

  // Apply max sites limit; clamp at 0 so a negative limit can never be
  // interpreted by slice() as "count from the end".
  if (options.maxSites < sitesToTest.length) {
    sitesToTest = sitesToTest.slice(0, Math.max(0, options.maxSites));
  }

  return sitesToTest;
}

/**
 * Run tests for the selected sites
 *
 * Scrapes each selected site in sequence, logs a per-site result, and writes
 * a summary.json into a timestamped run directory under options.outputDir.
 * Exits the process with code 1 when no site matches or when the run
 * directory cannot be created. A failure while scraping one site is recorded
 * in the summary and does not stop the run.
 *
 * @param {object} options - Options object as produced by parseArgs.
 * @returns {Promise<void>}
 */
async function runTests(options) {
  const sitesToTest = getSitesToTest(options);

  if (sitesToTest.length === 0) {
    console.log(chalk.redBright('❌ No sites found matching your criteria'));
    process.exit(1);
  }

  console.log(chalk.blueBright(`
🔍 Running tests for ${chalk.whiteBright(sitesToTest.length)} sites
${options.scrapeImages ? '📸 Image scraping: ' + chalk.greenBright('enabled') : '📸 Image scraping: ' + chalk.redBright('disabled')}
${options.downloadImages ? '📥 Image downloading: ' + chalk.greenBright('enabled') : '📥 Image downloading: ' + chalk.redBright('disabled')}
${chalk.yellowBright('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━')}
`));

  // Create timestamp for this test run (':' replaced so the ISO string can be
  // used as a directory name)
  const timestamp = new Date().toISOString().replace(/:/g, '-');
  const runDir = path.join(options.outputDir, `run_${timestamp}`);

  // Create output directory. The recursive mkdir of the run directory also
  // creates options.outputDir, so one guarded call covers both.
  try {
    await fs.mkdir(runDir, { recursive: true });
  } catch (error) {
    console.error(chalk.redBright(`❌ Error creating output directory: ${error.message}`));
    process.exit(1);
  }

  // Create a summary file
  const summaryData = {
    timestamp,
    options,
    sites: sitesToTest.map(site => ({
      name: site.name,
      url: site.url,
      category: site.category
    })),
    results: []
  };

  // Test each site (sequentially: each scrape is awaited before the next starts)
  for (let i = 0; i < sitesToTest.length; i++) {
    const site = sitesToTest[i];
    const siteNumber = i + 1;

    console.log(chalk.cyanBright(`
🌐 [${siteNumber}/${sitesToTest.length}] Testing ${chalk.whiteBright(site.name)} (${chalk.yellowBright(site.category)})
${chalk.whiteBright(site.url)}
`));

    const startTime = Date.now();

    try {
      // Create options for the scraper
      const scraperOptions = {
        maxScrolls: 20,
        scrollDelay: 1000,
        headless: true,
        scrapeImages: options.scrapeImages,
        downloadImages: options.downloadImages,
        output: runDir
      };

      // Run the scraper (using scrape instead of mainScraper)
      const result = await scrape(site.url, scraperOptions);

      const endTime = Date.now();
      const duration = ((endTime - startTime) / 1000).toFixed(2);
      const contentCount = result.content?.length ?? 0;
      const imageCount = result.images?.length ?? 0;

      console.log(chalk.greenBright(`
✅ ${chalk.whiteBright(site.name)} completed in ${chalk.yellowBright(duration + 's')}
📝 Content items: ${chalk.whiteBright(contentCount)}
📸 Images: ${chalk.whiteBright(imageCount)}
`));

      // Add to summary
      summaryData.results.push({
        name: site.name,
        url: site.url,
        category: site.category,
        success: true,
        duration: parseFloat(duration),
        contentCount,
        imageCount
      });
    } catch (error) {
      console.error(chalk.redBright(`
❌ Error testing ${chalk.whiteBright(site.name)}: ${error.message}
`));

      // Add failure to summary
      summaryData.results.push({
        name: site.name,
        url: site.url,
        category: site.category,
        success: false,
        error: error.message
      });
    }
  }

  // Save summary file
  const summaryFile = path.join(runDir, 'summary.json');
  await fs.writeFile(summaryFile, JSON.stringify(summaryData, null, 2));

  // Display test summary
  const successCount = summaryData.results.filter(r => r.success).length;
  const failureCount = summaryData.results.length - successCount;

  console.log(chalk.cyanBright(`
📊 Test Summary
${chalk.yellowBright('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━')}
✅ Success: ${chalk.greenBright(successCount)}
❌ Failure: ${chalk.redBright(failureCount)}
⏱️ Total sites tested: ${chalk.whiteBright(summaryData.results.length)}
📁 Results saved to: ${chalk.whiteBright(runDir)}
${chalk.yellowBright('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━')}
`));
}

/**
 * Main function
 *
 * Shows the banner, parses the command line, then either prints the help
 * text or runs the tests.
 *
 * @returns {Promise<void>}
 */
async function main() {
  displayBanner();

  const options = parseArgs();

  if (options.showHelp) {
    showHelp();
    return;
  }

  await runTests(options);
}

// Run the script
main().catch(error => {
  console.error(chalk.redBright(`
❌ Fatal error: ${error.message}
${error.stack}
`));
  process.exit(1);
});