@pinkpixel/prysm-llm
Version:
Structure-focused web scraper optimized for LLMs for use with MCP with minimal console output. Configure output directory with PRYSM_OUTPUT_DIR environment variable.
515 lines (463 loc) • 15.3 kB
JavaScript
// test_scraper.js - Test runner for the Prysm scraper
const { scrape } = require('./main_scraper');
const fs = require('fs').promises;
const path = require('path');
const chalk = require('chalk');
/**
 * Print the startup banner for the test runner.
 *
 * Emits two console.log calls: the ASCII-art logo coloured with
 * chalk.magentaBright, then the "TEST RUNNER" subtitle coloured with
 * chalk.cyanBright.
 */
function displayBanner() {
  // Both literals start with a newline so each section is preceded by a blank line.
  const logoArt = `
██████╗ ██████╗ ██╗ ██╗███████╗███╗ ███╗
██╔══██╗██╔══██╗╚██╗ ██╔╝██╔════╝████╗ ████║
██████╔╝██████╔╝ ╚████╔╝ ███████╗██╔████╔██║
██╔═══╝ ██╔══██╗ ╚██╔╝ ╚════██║██║╚██╔╝██║
██║ ██║ ██║ ██║ ███████║██║ ╚═╝ ██║
╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝`;
  const subtitle = `
✨ TEST RUNNER
------------------------------`;

  console.log(chalk.magentaBright(logoArt));
  console.log(chalk.cyanBright(subtitle));
}
/**
 * Parse command line arguments from process.argv.
 *
 * Recognised flags:
 *   --help, -h              set showHelp
 *   --category, -c <value>  set mode to 'category' and target to <value>
 *   --name, -n <value>      set mode to 'name' and target to <value>
 *   --scrape-images         set scrapeImages
 *   --download-images       set both scrapeImages and downloadImages
 *   --max <number>          set maxSites (positive integers only)
 *   --output, -o <value>    set outputDir
 *   --verbose, -v           set verbose
 * Unrecognised arguments are ignored.
 *
 * @returns {{mode: string, target: (string|null), scrapeImages: boolean,
 *   downloadImages: boolean, maxSites: number, outputDir: string,
 *   showHelp: boolean, verbose: boolean}} The parsed options; anything not
 *   given on the command line keeps its default.
 * @throws {Error} When a flag that takes a value is the last argument.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const options = {
    mode: 'all', // Default to testing all sites
    target: null,
    scrapeImages: false,
    downloadImages: false,
    maxSites: Infinity,
    outputDir: path.join(__dirname, 'test_results'),
    showHelp: false,
    verbose: false
  };

  // Fetch the value that follows the flag at `index`. Failing here gives a
  // clear message instead of storing `undefined` and crashing later on.
  const requireValue = (index) => {
    const value = args[index + 1];
    if (value === undefined) {
      throw new Error(`Option ${args[index]} requires a value`);
    }
    return value;
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--help':
      case '-h':
        options.showHelp = true;
        break;
      case '--category':
      case '-c':
        options.mode = 'category';
        options.target = requireValue(i);
        i++; // skip the consumed value
        break;
      case '--name':
      case '-n':
        options.mode = 'name';
        options.target = requireValue(i);
        i++;
        break;
      case '--scrape-images':
        options.scrapeImages = true;
        break;
      case '--download-images':
        options.scrapeImages = true; // Must scrape to download
        options.downloadImages = true;
        break;
      case '--max': {
        const parsed = Number.parseInt(requireValue(i), 10);
        // Only a positive integer is a usable limit. Zero, negative and
        // non-numeric input all fall back to "no limit"; a negative value
        // would otherwise make slice(0, -n) drop sites from the end.
        options.maxSites = Number.isInteger(parsed) && parsed > 0 ? parsed : Infinity;
        i++;
        break;
      }
      case '--output':
      case '-o':
        options.outputDir = requireValue(i);
        i++;
        break;
      case '--verbose':
      case '-v':
        options.verbose = true;
        break;
    }
  }
  return options;
}
/**
 * Display help message
 *
 * Prints the usage text (test modes, options and examples) with a single
 * console.log call, using chalk for colouring. The category list names every
 * category used by the built-in site list, including `newsletter` and
 * `ecommerce`.
 */
function showHelp() {
  console.log(`
${chalk.whiteBright('Usage:')} ${chalk.greenBright('node test_scraper.js')} ${chalk.yellowBright('[options]')}
${chalk.whiteBright('Test Modes:')}
${chalk.yellowBright('--category, -c <category>')} Test sites from a specific category
Categories: recipe, article, blog, newsletter, news, technical,
documentation, wiki, product, ecommerce, travel, social, government,
legal, education, entertainment
${chalk.yellowBright('--name, -n <site_name>')} Test a specific site by name
${chalk.whiteBright('Options:')}
${chalk.yellowBright('--scrape-images')} Enable image scraping
${chalk.yellowBright('--download-images')} Download images (enables scraping)
${chalk.yellowBright('--max <number>')} Maximum number of sites to test
${chalk.yellowBright('--output, -o <directory>')} Output directory for results
${chalk.yellowBright('--verbose, -v')} Show detailed logs
${chalk.yellowBright('--help, -h')} Show this help message
${chalk.whiteBright('Examples:')}
${chalk.greenBright('node test_scraper.js')} Test all sites
${chalk.greenBright('node test_scraper.js')} ${chalk.blueBright('--category news')} Test news sites only
${chalk.greenBright('node test_scraper.js')} ${chalk.blueBright('--name "BBC News"')} Test specific site
${chalk.greenBright('node test_scraper.js')} ${chalk.blueBright('--category recipe --max 2')} Test 2 recipe sites
${chalk.greenBright('node test_scraper.js')} ${chalk.blueBright('--download-images')} Download images for all sites
`);
}
// Master list of scrape targets covering many site categories.
// Each row is [name, url, category]; the trailing map() expands every row
// into a { name, url, category } record.
const ALL_TEST_SITES = [
  // Recipe Sites
  ['AllRecipes', 'https://www.allrecipes.com/recipe/256618/cubanelle-and-veal-bolognese/', 'recipe'],
  ['101 Cookbooks', 'https://www.101cookbooks.com/', 'recipe'],
  ['Smitten Kitchen', 'https://smittenkitchen.com/', 'recipe'],
  ['Serious Eats', 'https://www.seriouseats.com/classic-panzanella-salad-recipe', 'recipe'],
  ['Food Network', 'https://www.foodnetwork.com/recipes/food-network-kitchen/instant-pot-barbecue-pulled-pork-sandwiches-8306825', 'recipe'],
  ['BBC Good Food', 'https://www.bbcgoodfood.com/recipes/easy-chocolate-cake', 'recipe'],
  // Blog/Article Sites
  ['Medium Article', 'https://medium.com/blog/what-i-wish-i-knew-navigating-the-geography-of-a-creative-life-requires-a-compass-not-a-map-b7f10afdbd6a', 'article'],
  ['Wix Site', 'https://robmensching.com/blog/', 'blog'],
  ['WordPress Blog', 'https://citizenwells.com/2022/02/20/everyone-is-at-risk-for-blood-clots-cdc-and-pfizer-try-to-normalize-diffuse-widespread-stories-of-athletes-collapsing-and-dying-after-covid-jabs-athletes-collapsing-on-field-are/', 'blog'],
  ['Blogger', 'https://althouse.blogspot.com/2025/04/would-american-public-stand-for-it-it.html', 'blog'],
  ['Substack', 'https://substack.com/browse/staff-picks/post/150741708', 'newsletter'],
  // News Sites
  ['BBC News', 'https://www.bbc.com/news/articles/c0m90jjewd7o', 'news'],
  ['CNN', 'https://www.cnn.com/2025/04/02/health/cory-booker-prepared-his-body-speech/index.html', 'news'],
  ['New York Times', 'https://www.nytimes.com/live/2025/04/02/business/trump-tariffs-liberation-day', 'news'],
  ['The Verge', 'https://www.theverge.com/news/642049/nintendo-switch-2-verge-staff-reacts', 'news'],
  // Technical Sites
  ['Stack Overflow', 'https://stackoverflow.com/questions/11227809/why-is-processing-a-sorted-array-faster-than-processing-an-unsorted-array', 'technical'],
  ['GitHub Readme', 'https://github.com/puppeteer/puppeteer', 'technical'],
  ['TechHub', 'https://techhub.iodigital.com/articles/what-is-webllm?ref=dailydev', 'technical'],
  // Documentation Sites
  ['MDN Web Docs', 'https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map', 'documentation'],
  ['Unsloth Docs', 'https://docs.unsloth.ai/', 'documentation'],
  ['Cursor Docs', 'https://docs.cursor.com/get-started/welcome', 'documentation'],
  // Wiki Sites
  ['Wikipedia', 'https://en.wikipedia.org/wiki/Artificial_intelligence', 'wiki'],
  ['WikiHow', 'https://www.wikihow.com/Make-a-Tree-Seat', 'wiki'],
  ['WikiBooks', 'https://en.wikibooks.org/wiki/PSP_Programming/Text_Menu', 'wiki'],
  // Product Sites
  ['macys', 'https://www.macys.com/shop/product/franco-sarto-womens-marlina-fisherman-pointed-toe-kitten-heel-mules?ID=19490453', 'product'],
  ['Amazon Product', 'https://www.amazon.com/Paris-Hydrating-Dehydrated-Hyaluronic-Paraben-Free/dp/B0BCR23QDG', 'ecommerce'],
  // Travel Sites
  ['Lonely Planet', 'https://www.lonelyplanet.com/articles/best-places-to-visit-in-japan', 'travel'],
  ['TripAdvisor', 'https://www.tripadvisor.com/Tourism-g60763-New_York_City_New_York-Vacations.html', 'travel'],
  // Social Media Sites
  ['CigarScanner', 'https://www.cigarscanner.com/tabs/social/users/ebff7911-0be1-4e84-8211-ce7930a508d2', 'social'],
  ['Instagram', 'https://www.instagram.com/neogurumi/', 'social'],
  // Government and Legal Sites
  ['White House', 'https://www.whitehouse.gov/', 'government'],
  ['New Hampshire', 'https://gc.nh.gov/rsa/html/LV/540/540-mrg.htm', 'government'],
  ['Alabama Law', 'https://law.justia.com/codes/alabama/title-28/chapter-1/section-28-1-5/', 'legal'],
  // Education Sites
  ['Coursera', 'https://www.coursera.org/degrees/ms-computer-science-boulder', 'education'],
  ['Rasmussen', 'https://www.rasmussen.edu/student-experience/college-life/15-educational-search-engines/', 'education'],
  ['Harvard', 'https://news.harvard.edu/gazette/story/2024/08/on-move-in-day-hugs-sweat-and-tears/', 'education'],
  // Entertainment Sites
  ['IMDB', 'https://www.imdb.com/title/tt0117500/', 'entertainment'],
  ['Rotten Tomatoes', 'https://www.rottentomatoes.com/m/the_dark_knight', 'entertainment'],
  ['Metacritic', 'https://www.metacritic.com/game/pc/the-witcher-3-wild-hunt', 'entertainment'],
].map(([name, url, category]) => ({ name, url, category }));
/**
 * Get the list of sites to test based on options.
 *
 * @param {object} options
 * @param {string} options.mode - 'category', 'name', or anything else for all sites.
 * @param {?string} options.target - Category or site name to match, compared
 *   case-insensitively. A missing target matches nothing.
 * @param {number} [options.maxSites] - Upper bound on the number of sites
 *   returned. Negative values are treated as 0; NaN/undefined mean no limit.
 * @returns {Array<{name: string, url: string, category: string}>} A new array
 *   (never the shared ALL_TEST_SITES array itself) holding the selected sites
 *   in their original order.
 */
function getSitesToTest(options) {
  // Normalise once. A non-string target (e.g. the flag was given without a
  // value) must select nothing rather than throw on .toLowerCase().
  const wanted = typeof options.target === 'string' ? options.target.toLowerCase() : null;
  const matchesTarget = (value) => wanted !== null && value.toLowerCase() === wanted;

  let sitesToTest;
  switch (options.mode) {
    case 'category':
      sitesToTest = ALL_TEST_SITES.filter((site) => matchesTarget(site.category));
      break;
    case 'name':
      sitesToTest = ALL_TEST_SITES.filter((site) => matchesTarget(site.name));
      break;
    case 'all':
    default:
      // Copy so callers can never mutate the module-level list.
      sitesToTest = [...ALL_TEST_SITES];
      break;
  }

  // Apply max sites limit. Clamp at zero: a negative end index would make
  // slice() drop items from the end instead of limiting the count.
  const limit = Number(options.maxSites);
  if (!Number.isNaN(limit) && limit < sitesToTest.length) {
    sitesToTest = sitesToTest.slice(0, Math.max(0, Math.trunc(limit)));
  }
  return sitesToTest;
}
/**
 * Scrape a single site and build its summary record (private helper of runTests).
 *
 * Never rejects for scraper failures: any error thrown while scraping or
 * while reading the result is logged and reported as a `success: false` record.
 *
 * @param {{name: string, url: string, category: string}} site - Site to scrape.
 * @param {object} scraperOptions - Options forwarded unchanged to scrape().
 * @returns {Promise<object>} Summary record for the site.
 */
async function runSiteTest(site, scraperOptions) {
  const { name, url, category } = site;
  const startTime = Date.now();
  try {
    // Run the scraper (using scrape instead of mainScraper)
    const result = await scrape(url, scraperOptions);
    const duration = ((Date.now() - startTime) / 1000).toFixed(2);
    const contentCount = result.content?.length || 0;
    const imageCount = result.images?.length || 0;
    console.log(chalk.greenBright(`
✅ ${chalk.whiteBright(name)} completed in ${chalk.yellowBright(duration + 's')}
📝 Content items: ${chalk.whiteBright(contentCount)}
📸 Images: ${chalk.whiteBright(imageCount)}
`));
    return {
      name,
      url,
      category,
      success: true,
      duration: parseFloat(duration),
      contentCount,
      imageCount
    };
  } catch (error) {
    // A rejection is not guaranteed to be an Error instance; fall back to its
    // string form so the failure reason is always recorded and a null
    // rejection cannot abort the whole run.
    const reason = error instanceof Error ? error.message : String(error);
    console.error(chalk.redBright(`
❌ Error testing ${chalk.whiteBright(name)}:
${reason}
`));
    return {
      name,
      url,
      category,
      success: false,
      error: reason
    };
  }
}

/**
 * Print the end-of-run success/failure totals (private helper of runTests).
 *
 * @param {Array<{success: boolean}>} results - One record per tested site.
 * @param {string} runDir - Directory the run's output was written to.
 */
function printTestSummary(results, runDir) {
  const successCount = results.filter(r => r.success).length;
  const failureCount = results.length - successCount;
  console.log(chalk.cyanBright(`
📊 Test Summary
${chalk.yellowBright('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━')}
✅ Success: ${chalk.greenBright(successCount)}
❌ Failure: ${chalk.redBright(failureCount)}
⏱️ Total sites tested: ${chalk.whiteBright(results.length)}
📁 Results saved to: ${chalk.whiteBright(runDir)}
${chalk.yellowBright('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━')}
`));
}

/**
 * Run tests for the selected sites.
 *
 * Selects sites via getSitesToTest(options), scrapes them one after another,
 * writes `summary.json` into a fresh `run_<timestamp>` directory under
 * options.outputDir, and prints a summary. Exits the process with status 1
 * when no site matches or when the run directory cannot be created.
 *
 * @param {object} options - Parsed CLI options; this function reads
 *   scrapeImages, downloadImages and outputDir, and stores the whole object
 *   in the summary file.
 * @returns {Promise<void>}
 */
async function runTests(options) {
  const sitesToTest = getSitesToTest(options);
  if (sitesToTest.length === 0) {
    console.log(chalk.redBright('❌ No sites found matching your criteria'));
    process.exit(1);
  }
  console.log(chalk.blueBright(`
🔍 Running tests for ${chalk.whiteBright(sitesToTest.length)} sites
${options.scrapeImages ? '📸 Image scraping: ' + chalk.greenBright('enabled') : '📸 Image scraping: ' + chalk.redBright('disabled')}
${options.downloadImages ? '📥 Image downloading: ' + chalk.greenBright('enabled') : '📥 Image downloading: ' + chalk.redBright('disabled')}
${chalk.yellowBright('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━')}
`));

  // Create timestamp for this test run (colons replaced with dashes for the directory name)
  const timestamp = new Date().toISOString().replace(/:/g, '-');

  // Create the per-run output directory. `recursive: true` also creates
  // options.outputDir, so one guarded call covers both directories and any
  // failure, including an invalid outputDir, is reported the same way.
  let runDir;
  try {
    runDir = path.join(options.outputDir, `run_${timestamp}`);
    await fs.mkdir(runDir, { recursive: true });
  } catch (error) {
    console.error(chalk.redBright(`❌ Error creating output directory: ${error.message}`));
    process.exit(1);
  }

  // Collected into the summary file
  const summaryData = {
    timestamp,
    options,
    sites: sitesToTest.map(({ name, url, category }) => ({ name, url, category })),
    results: []
  };

  // The scraper options are identical for every site, so build them once.
  const scraperOptions = {
    maxScrolls: 20,
    scrollDelay: 1000,
    headless: true,
    scrapeImages: options.scrapeImages,
    downloadImages: options.downloadImages,
    output: runDir
  };

  // Test each site, one at a time
  for (const [index, site] of sitesToTest.entries()) {
    const siteNumber = index + 1;
    console.log(chalk.cyanBright(`
🌐 [${siteNumber}/${sitesToTest.length}] Testing ${chalk.whiteBright(site.name)} (${chalk.yellowBright(site.category)})
${chalk.whiteBright(site.url)}
`));
    summaryData.results.push(await runSiteTest(site, scraperOptions));
  }

  // Save summary file
  const summaryFile = path.join(runDir, 'summary.json');
  await fs.writeFile(summaryFile, JSON.stringify(summaryData, null, 2));

  // Display test summary
  printTestSummary(summaryData.results, runDir);
}
/**
 * Script entry point: show the banner, parse the command line, then either
 * print the help text or run the tests with the parsed options.
 *
 * @returns {Promise<void>} Settles once help has been printed or runTests() finishes.
 */
async function main() {
  displayBanner();
  const cliOptions = parseArgs();

  if (!cliOptions.showHelp) {
    await runTests(cliOptions);
    return;
  }

  showHelp();
}
// Run the script
// Any rejection that escapes main() is printed with its message and stack,
// after which the process exits with status 1.
const reportFatalError = (failure) => {
  const report = `
❌ Fatal error:
${failure.message}
${failure.stack}
`;
  console.error(chalk.redBright(report));
  process.exit(1);
};

main().catch(reportFatalError);