UNPKG

scrapegraph-js

Version:

Scrape and extract structured data from a webpage using ScrapeGraphAI's APIs. Supports cookies for authentication, infinite scrolling, and pagination.

262 lines (220 loc) 7.79 kB
/**
 * Comprehensive example demonstrating cookies integration for web scraping.
 *
 * This example shows various real-world scenarios where cookies are essential:
 * 1. E-commerce site scraping with authentication
 * 2. Social media scraping with session cookies
 * 3. Banking/financial site scraping with secure cookies
 * 4. News site scraping with user preferences
 * 5. API endpoint scraping with authentication tokens
 *
 * Requirements:
 * - Node.js 16+
 * - scrapegraph-js
 * - A .env file with your SGAI_APIKEY
 *
 * Example .env file:
 * SGAI_APIKEY=your_api_key_here
 */

import { smartScraper } from 'scrapegraph-js';
import { z } from 'zod';
import 'dotenv/config';

// Define data schemas for different scenarios
const ProductInfoSchema = z.object({
  name: z.string().describe('Product name'),
  price: z.string().describe('Product price'),
  availability: z.string().describe('Product availability status'),
  rating: z.string().optional().describe('Product rating'),
});

const SocialMediaPostSchema = z.object({
  author: z.string().describe('Post author'),
  content: z.string().describe('Post content'),
  likes: z.string().optional().describe('Number of likes'),
  comments: z.string().optional().describe('Number of comments'),
  timestamp: z.string().optional().describe('Post timestamp'),
});

const NewsArticleSchema = z.object({
  title: z.string().describe('Article title'),
  summary: z.string().describe('Article summary'),
  author: z.string().optional().describe('Article author'),
  publish_date: z.string().optional().describe('Publish date'),
});

const BankTransactionSchema = z.object({
  date: z.string().describe('Transaction date'),
  description: z.string().describe('Transaction description'),
  amount: z.string().describe('Transaction amount'),
  type: z.string().describe('Transaction type (credit/debit)'),
});

// Horizontal rule printed above and below every section title.
const SEPARATOR = '='.repeat(60);

/**
 * Prints a section banner, runs one smartScraper request with cookies, and
 * logs either the JSON response or the error message.
 *
 * Errors are logged rather than rethrown so that one failing scenario does
 * not prevent the remaining scenarios in main() from running.
 *
 * @param {object} scenario
 * @param {string} scenario.title - Banner text printed between the separators.
 * @param {string} scenario.successLabel - Label used in the success message.
 * @param {string} scenario.errorLabel - Label used in the error message.
 * @param {string} scenario.websiteUrl - URL passed to smartScraper.
 * @param {string} scenario.userPrompt - Extraction prompt passed to smartScraper.
 * @param {object|null} [scenario.schema=null] - Zod schema, or null for none.
 * @param {number|null} [scenario.numberOfScrolls=null] - Scroll count, or null.
 * @param {number|null} [scenario.totalPages=null] - Page count, or null.
 * @param {object} scenario.cookies - Cookie name/value pairs sent with the request.
 * @param {boolean} [scenario.leadingBlankLine=true] - Whether the banner is
 *   preceded by a newline (the first section is printed without one).
 * @returns {Promise<void>}
 */
async function runCookieScenario({
  title,
  successLabel,
  errorLabel,
  websiteUrl,
  userPrompt,
  schema = null,
  numberOfScrolls = null,
  totalPages = null,
  cookies,
  leadingBlankLine = true,
}) {
  console.log(`${leadingBlankLine ? '\n' : ''}${SEPARATOR}`);
  console.log(title);
  console.log(SEPARATOR);

  try {
    const response = await smartScraper(
      process.env.SGAI_APIKEY,
      websiteUrl,
      userPrompt,
      schema,
      numberOfScrolls,
      totalPages,
      cookies,
    );

    console.log(`✅ ${successLabel} scraping completed successfully`);
    console.log(JSON.stringify(response, null, 2));
  } catch (error) {
    console.error(`❌ Error in ${errorLabel} scraping: ${error.message}`);
  }
}

/**
 * Scenario 1: product listing scraped with authentication cookies and
 * scrolling to load more products.
 *
 * @returns {Promise<void>}
 */
async function scrapeEcommerceWithAuth() {
  // Example cookies for an e-commerce site
  const cookies = {
    session_id: 'abc123def456',
    user_id: 'user789',
    cart_id: 'cart101112',
    preferences: 'dark_mode,usd',
    auth_token: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...',
  };

  await runCookieScenario({
    title: 'E-COMMERCE SITE SCRAPING WITH AUTHENTICATION',
    successLabel: 'E-commerce',
    errorLabel: 'e-commerce',
    websiteUrl: 'https://example-ecommerce.com/products',
    userPrompt: 'Extract product information including name, price, availability, and rating',
    schema: ProductInfoSchema,
    numberOfScrolls: 5, // Scroll to load more products
    cookies,
    leadingBlankLine: false, // first section: no blank line before the banner
  });
}

/**
 * Scenario 2: social feed scraped with session cookies and scrolling to load
 * more posts.
 *
 * @returns {Promise<void>}
 */
async function scrapeSocialMediaWithSession() {
  // Example cookies for a social media site
  const cookies = {
    session_token: 'xyz789abc123',
    user_session: 'def456ghi789',
    csrf_token: 'jkl012mno345',
    remember_me: 'true',
    language: 'en_US',
  };

  await runCookieScenario({
    title: 'SOCIAL MEDIA SCRAPING WITH SESSION COOKIES',
    successLabel: 'Social media',
    errorLabel: 'social media',
    websiteUrl: 'https://example-social.com/feed',
    userPrompt: 'Extract posts from the feed including author, content, likes, and comments',
    schema: SocialMediaPostSchema,
    numberOfScrolls: 10, // Scroll to load more posts
    cookies,
  });
}

/**
 * Scenario 3: news listing scraped with user-preference cookies across
 * multiple pages.
 *
 * @returns {Promise<void>}
 */
async function scrapeNewsWithPreferences() {
  // Example cookies for a news site
  const cookies = {
    user_preferences: 'technology,science,ai',
    reading_level: 'advanced',
    region: 'US',
    subscription_tier: 'premium',
    theme: 'dark',
  };

  await runCookieScenario({
    title: 'NEWS SITE SCRAPING WITH USER PREFERENCES',
    successLabel: 'News',
    errorLabel: 'news',
    websiteUrl: 'https://example-news.com/technology',
    userPrompt: 'Extract news articles including title, summary, author, and publish date',
    schema: NewsArticleSchema,
    totalPages: 3, // Scrape multiple pages
    cookies,
  });
}

/**
 * Scenario 4: transaction history scraped with secure session cookies across
 * multiple pages.
 *
 * @returns {Promise<void>}
 */
async function scrapeBankingWithSecureCookies() {
  // Example secure cookies for a banking site
  const cookies = {
    secure_session: 'pqr678stu901',
    auth_token: 'vwx234yz567',
    mfa_verified: 'true',
    device_id: 'device_abc123',
    last_activity: '2024-01-15T10:30:00Z',
  };

  await runCookieScenario({
    title: 'BANKING SITE SCRAPING WITH SECURE COOKIES',
    successLabel: 'Banking',
    errorLabel: 'banking',
    websiteUrl: 'https://example-bank.com/transactions',
    userPrompt: 'Extract recent transactions including date, description, amount, and type',
    schema: BankTransactionSchema,
    totalPages: 5, // Scrape multiple pages of transactions
    cookies,
  });
}

/**
 * Scenario 5: API endpoint scraped with authentication-token cookies and no
 * output schema.
 *
 * @returns {Promise<void>}
 */
async function scrapeApiWithAuthTokens() {
  // Example API authentication cookies
  const cookies = {
    api_token: 'api_abc123def456',
    client_id: 'client_789',
    access_token: 'access_xyz789',
    refresh_token: 'refresh_abc123',
    scope: 'read:all',
  };

  await runCookieScenario({
    title: 'API ENDPOINT SCRAPING WITH AUTH TOKENS',
    successLabel: 'API',
    errorLabel: 'API',
    websiteUrl: 'https://api.example.com/data',
    userPrompt: 'Extract data from the API response',
    schema: null, // No schema for generic API response
    cookies,
  });
}

/**
 * Entry point: verifies that SGAI_APIKEY is set, then runs every scenario in
 * order, one at a time.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Check if API key is available
  if (!process.env.SGAI_APIKEY) {
    console.error('Error: SGAI_APIKEY not found in .env file');
    console.log('Please create a .env file with your API key:');
    console.log('SGAI_APIKEY=your_api_key_here');
    // Signal the failed precondition to the shell / CI instead of exiting 0.
    process.exitCode = 1;
    return;
  }

  console.log('🍪 COOKIES INTEGRATION EXAMPLES');
  console.log('This demonstrates various real-world scenarios where cookies are essential for web scraping.');

  // Run all examples
  await scrapeEcommerceWithAuth();
  await scrapeSocialMediaWithSession();
  await scrapeNewsWithPreferences();
  await scrapeBankingWithSecureCookies();
  await scrapeApiWithAuthTokens();

  console.log(`\n${SEPARATOR}`);
  console.log('✅ All examples completed!');
  console.log(SEPARATOR);
}

// Run the example
main().catch((error) => {
  console.error(error);
  // An unexpected failure should not report success to the caller.
  process.exitCode = 1;
});