UNPKG

gpt-research

Version:

Autonomous AI research agent that conducts comprehensive research on any topic and generates detailed reports with citations

329 lines 12.6 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.PuppeteerScraper = void 0; const puppeteer_core_1 = __importDefault(require("puppeteer-core")); const chromium_1 = __importDefault(require("@sparticuz/chromium")); const readability_1 = require("@mozilla/readability"); const jsdom_1 = require("jsdom"); const BaseScraper_1 = require("./BaseScraper"); class PuppeteerScraper extends BaseScraper_1.BaseScraper { config; browser = null; constructor(name = 'Puppeteer', config) { super(name, config); // Detect if running on Vercel const isVercel = process.env.VERCEL === '1' || process.env.NOW_REGION !== undefined; this.config = { headless: true, useReadability: true, waitUntil: 'networkidle2', scrollToBottom: true, extractImages: true, viewport: { width: 1920, height: 1080 }, isVercel, ...config }; } async scrape(url) { let page = null; try { if (!this.validateUrl(url)) { throw new Error('Invalid URL'); } this.emit('scraping_start', { scraper: this.name, url }); // Launch browser if not already launched if (!this.browser) { await this.launchBrowser(); } // Create new page page = await this.browser.newPage(); // Set viewport if (this.config.viewport) { await page.setViewport(this.config.viewport); } // Set user agent if (this.config.userAgent) { await page.setUserAgent(this.config.userAgent); } // Block unnecessary resources to speed up loading if (this.config.blockResources && this.config.blockResources.length > 0) { await page.setRequestInterception(true); page.on('request', (request) => { if (this.config.blockResources.includes(request.resourceType())) { request.abort(); } else { request.continue(); } }); } // Navigate to the page await page.goto(url, { waitUntil: this.config.waitUntil, timeout: this.config.timeout }); // Wait for specific selector if provided if (this.config.waitForSelector) { await page.waitForSelector(this.config.waitForSelector, { timeout: 5000 }).catch(() => { this.emit('warning', { message: `Selector ${this.config.waitForSelector} not found, continuing anyway` }); }); } // Scroll to bottom to trigger lazy loading if (this.config.scrollToBottom) { await this.autoScroll(page); } // Get the page content const html = await page.content(); // Extract content let result; if (this.config.useReadability) { result = await this.extractWithReadability(page, url, html); } else { result = await this.extractWithPuppeteer(page, url); } this.emit('scraping_complete', { scraper: this.name, url, method: this.config.useReadability ? 'readability' : 'puppeteer' }); return result; } catch (error) { return this.handleError(error, url); } finally { // Close the page if (page) { await page.close().catch(() => { }); } } } async launchBrowser() { const options = { headless: this.config.headless, args: [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-accelerated-2d-canvas', '--no-first-run', '--no-zygote', '--single-process', // For serverless environments '--disable-gpu' ] }; // Configure for Vercel/serverless environment if (this.config.isVercel) { options.args = [...(chromium_1.default.args || []), ...(options.args || [])]; options.defaultViewport = chromium_1.default.defaultViewport; options.executablePath = await chromium_1.default.executablePath(); options.headless = chromium_1.default.headless; } else if (this.config.executablePath) { options.executablePath = this.config.executablePath; } else { // Try to use system Chrome/Chromium // Try to import regular puppeteer (not installed in this setup) // const puppeteerDefault = (await import('puppeteer')).default; const puppeteerDefault = null; // Use puppeteer-core only if (puppeteerDefault) { this.browser = await puppeteerDefault.launch(options); } else { throw new Error('Puppeteer not available'); } return; } this.browser = await puppeteer_core_1.default.launch(options); } async autoScroll(page) { await page.evaluate(async () => { await new Promise((resolve) => { let totalHeight = 0; const distance = 100; const timer = setInterval(() => { const scrollHeight = document.body.scrollHeight; window.scrollBy(0, distance); totalHeight += distance; if (totalHeight >= scrollHeight) { clearInterval(timer); resolve(); } }, 100); // Maximum scroll time: 10 seconds setTimeout(() => { clearInterval(timer); resolve(); }, 10000); }); }); } async extractWithReadability(page, url, html) { try { const dom = new jsdom_1.JSDOM(html, { url }); const reader = new readability_1.Readability(dom.window.document); const article = reader.parse(); if (!article) { // Fallback to Puppeteer extraction return this.extractWithPuppeteer(page, url); } // Extract images if enabled const images = this.config.extractImages ? await this.extractImages(page) : []; return { url, title: article.title, content: this.cleanText(article.textContent || ''), markdown: this.htmlToMarkdown(article.content || ''), images, metadata: { byline: article.byline, excerpt: article.excerpt, length: article.length, siteName: article.siteName } }; } catch (error) { this.emit('warning', { message: 'Readability extraction failed, falling back to Puppeteer', error: error instanceof Error ? error.message : String(error) }); return this.extractWithPuppeteer(page, url); } } async extractWithPuppeteer(page, url) { // Extract data using Puppeteer's evaluation const data = await page.evaluate(() => { // Helper function to get text content const getText = (selector) => { const element = document.querySelector(selector); return element?.textContent?.trim() || ''; }; // Extract title const title = document.querySelector('meta[property="og:title"]')?.getAttribute('content') || document.querySelector('meta[name="twitter:title"]')?.getAttribute('content') || document.title || getText('h1') || 'Untitled'; // Extract main content const contentSelectors = [ 'main', 'article', '[role="main"]', '#main', '#content', '.content', '.post', '.article' ]; let content = ''; for (const selector of contentSelectors) { const element = document.querySelector(selector); if (element) { content = element.textContent || ''; break; } } if (!content) { // Fallback: get body text const body = document.body.cloneNode(true); // Remove unwanted elements const unwanted = body.querySelectorAll('script, style, nav, header, footer'); unwanted.forEach(el => el.remove()); content = body.textContent || ''; } // Extract metadata const metadata = {}; const metaTags = document.querySelectorAll('meta[name], meta[property]'); metaTags.forEach(tag => { const name = tag.getAttribute('name') || tag.getAttribute('property'); const content = tag.getAttribute('content'); if (name && content) { metadata[name] = content; } }); // Extract structured data const jsonLdScripts = document.querySelectorAll('script[type="application/ld+json"]'); if (jsonLdScripts.length > 0) { metadata.structuredData = []; jsonLdScripts.forEach(script => { try { metadata.structuredData.push(JSON.parse(script.textContent || '{}')); } catch { // Ignore parsing errors } }); } return { title, content, metadata }; }); // Extract images if enabled const images = this.config.extractImages ? await this.extractImages(page) : []; return { url, title: data.title, content: this.cleanText(data.content), markdown: this.htmlToMarkdown(data.content), images, metadata: data.metadata }; } async extractImages(page) { return page.evaluate(() => { const images = []; const seen = new Set(); // Get all img elements document.querySelectorAll('img').forEach(img => { const src = img.src || img.getAttribute('data-src') || img.getAttribute('data-lazy-src'); if (src && !seen.has(src)) { seen.add(src); images.push(src); } }); // Get og:image const ogImage = document.querySelector('meta[property="og:image"]')?.getAttribute('content'); if (ogImage && !seen.has(ogImage)) { images.unshift(ogImage); // Add to beginning } return images; }); } async close() { if (this.browser) { await this.browser.close(); this.browser = null; } } // Override scrapeMultiple to reuse browser async scrapeMultiple(urls, concurrency = 3) { try { // Launch browser once for all URLs if (!this.browser) { await this.launchBrowser(); } // Use parent's scrapeMultiple which will reuse the browser return super.scrapeMultiple(urls, concurrency); } finally { // Close browser after all scraping is done await this.close(); } } } exports.PuppeteerScraper = PuppeteerScraper; //# sourceMappingURL=PuppeteerScraper.js.map