UNPKG

@pinkpixel/prysm-mcp

Version:

MCP server for the Prysm web scraper - enabling AI assistants to scrape web content

1,425 lines (1,246 loc) 85 kB
/** * MainExtractor - Content extraction focused on trying all possible methods * * This class implements multiple extraction methods and tries all of them * on every page to maximize content extraction. */ // Import default options from utils const { DEFAULT_OPTIONS } = require('./defaultOptions'); // Import chalk for colored output const chalk = require('chalk'); // Create helper function for consistent delays const wait = (ms) => new Promise(resolve => setTimeout(resolve, ms)); // Helper function to sanitize CSS selectors const sanitizeSelector = (selector) => { if (!selector) return ''; // Replace invalid characters and escape special characters return selector .replace(/:/g, '\\:') .replace(/\./g, '.') .replace(/\//g, '\\/') .replace(/\+/g, '\\+') .replace(/\[/g, '\\[') .replace(/\]/g, '\\]') .replace(/\(/g, '\\(') .replace(/\)/g, '\\)') .replace(/!/g, '\\!') .replace(/@/g, '\\@') .replace(/,/g, '\\,') .replace(/~/g, '\\~') .trim(); }; class MainExtractor { /** * Create a new MainExtractor instance * @param {Object} page - Puppeteer page instance * @param {Object} options - Options for extraction */ constructor(page, options = {}) { this.page = page; this.data = { url: '', title: '', content: [], images: [], metadata: {} }; // Apply analysis results if available this.analysisResult = options.analysisResult || null; // Configure extractor options from analysis this.priorityExtractors = []; this.skipExtractors = []; // Track the successful extractor to optimize subsequent extraction after pagination this.successfulExtractor = null; // Add compatibility for different Puppeteer versions this.ensureCompatibility(); } /** * Ensure compatibility with different Puppeteer versions */ ensureCompatibility() { try { // Try to add waitForFunction if it doesn't exist if (!this.page.waitForFunction) { this.page.waitForFunction = async (pageFunction, options = {}, ...args) => { return await this.page.evaluate(pageFunction, ...args); }; } } catch (error) { // Silently continue if we can't add compatibility } } /** * Safe wait function that works with all Puppeteer versions */ async safeWait(ms) { try { if (this.page.waitForTimeout) { await this.page.waitForTimeout(ms); } else { await new Promise(resolve => setTimeout(resolve, ms)); } } catch (error) { // Fallback to setTimeout if there's an error await new Promise(resolve => setTimeout(resolve, ms)); } } /** * Determines if an extractor should be run based on Smart Scan analysis * @param {string} extractorName - Name of the extractor method * @returns {boolean} Whether the extractor should run */ shouldRunExtractor(extractorName) { // Skip if explicitly included in skipExtractors if (this.skipExtractors.includes(extractorName)) { return false; } // If there are priority extractors defined, only run those if (this.priorityExtractors.length > 0) { return this.priorityExtractors.includes(extractorName); } // Otherwise run all extractors return true; } /** * Check if extraction is already sufficient * @param {Array} content - The extracted content * @returns {boolean} True if content is sufficient * @private */ _isExtractionSufficient(content) { if (!content || !Array.isArray(content)) return false; // Filter out empty items const validContent = content.filter(item => { if (typeof item === 'string') return item.trim().length > 0; if (typeof item === 'object' && item.text) return item.text.trim().length > 0; return false; }); // Check content count if (validContent.length < 2) return false; // Calculate total text length const totalLength = validContent.reduce((sum, item) => { const text = typeof item === 'string' ? item : item.text || ''; return sum + text.length; }, 0); // Check if we have at least one heading/title const hasHeading = validContent.some(item => { if (typeof item === 'object' && item.tag) { return ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title'].includes(item.tag); } return false; }); // Check if this is a minimal site (like example.com) const isMinimalSite = this.page.url().includes('example.com') || totalLength < 500 && validContent.length < 5; // For minimal sites, use much lower thresholds if (isMinimalSite) { return hasHeading && validContent.length >= 2; } // Consider extraction sufficient if we have: // 1. Reasonable amount of content (>1000 chars) // 2. At least one heading // 3. At least 5 content items return (totalLength > 1000 && hasHeading && validContent.length >= 5); } /** * Main extraction method that tries all extraction approaches */ async extract() { try { let newContent = []; // Extract title if not already set if (!this.data.title) { this.data.title = await this.extractTitle(); } // If we already have a successful extractor from previous extraction, // only use that one to avoid redundant processing if (this.successfulExtractor) { try { let extractedContent = []; switch (this.successfulExtractor) { case 'recipes': extractedContent = await this.extractFromRecipes(); break; case 'article': extractedContent = await this.extractFromArticle(); break; case 'mainContent': extractedContent = await this.extractFromMainContent(); break; case 'semantic': extractedContent = await this.extractFromSemantic(); break; case 'headerContentFooter': extractedContent = await this.extractFromHeaderContentFooter(); break; case 'multiColumn': extractedContent = await this.extractFromMultiColumn(); break; case 'contentSections': extractedContent = await this.extractFromContentSections(); break; case 'singleColumn': extractedContent = await this.extractFromSingleColumn(); break; case 'largestContent': extractedContent = await this.extractFromLargest(); break; case 'product': extractedContent = await this.extractFromProduct(); break; case 'documentation': extractedContent = await this.extractFromDocumentation(); break; case 'basic': extractedContent = await this.extractFromBasic(); break; case 'textDensity': extractedContent = await this.extractFromTextDensity(); break; } if (extractedContent && extractedContent.length > 0) { newContent.push(...extractedContent); // Add to this.data.content immediately to check if it's sufficient const uniqueContent = extractedContent.filter(item => { if (!item) return false; const text = typeof item === 'string' ? item : item.text; if (!text) return false; return !this.data.content.some(existing => { const existingText = typeof existing === 'string' ? existing : existing.text; return existingText === text; }); }); this.data.content.push(...uniqueContent); } } catch (error) { // If the successful extractor fails, fall back to trying all extractors this.successfulExtractor = null; } } // If no successful extractor yet, or if it failed, try all extractors if (!this.successfulExtractor) { // Sort extractors by priority if provided const sortedExtractors = this._getSortedExtractors(); // Try each extractor until we get sufficient content for (const extractorName of sortedExtractors) { if (this._isExtractionSufficient(this.data.content)) { // We already have good content, skip remaining extractors break; } try { let extractedContent = []; switch (extractorName) { case 'recipes': extractedContent = await this.extractFromRecipes(); break; case 'article': extractedContent = await this.extractFromArticle(); break; case 'mainContent': extractedContent = await this.extractFromMainContent(); break; case 'semantic': extractedContent = await this.extractFromSemantic(); break; case 'headerContentFooter': extractedContent = await this.extractFromHeaderContentFooter(); break; case 'multiColumn': extractedContent = await this.extractFromMultiColumn(); break; case 'contentSections': extractedContent = await this.extractFromContentSections(); break; case 'singleColumn': extractedContent = await this.extractFromSingleColumn(); break; case 'largestContent': extractedContent = await this.extractFromLargest(); break; case 'product': extractedContent = await this.extractFromProduct(); break; case 'documentation': extractedContent = await this.extractFromDocumentation(); break; case 'basic': extractedContent = await this.extractFromBasic(); break; case 'textDensity': extractedContent = await this.extractFromTextDensity(); break; } if (extractedContent && extractedContent.length > 0) { newContent.push(...extractedContent); // Add to this.data.content immediately to check if it's sufficient const uniqueContent = extractedContent.filter(item => { if (!item) return false; const text = typeof item === 'string' ? item : item.text; if (!text) return false; return !this.data.content.some(existing => { const existingText = typeof existing === 'string' ? existing : existing.text; return existingText === text; }); }); this.data.content.push(...uniqueContent); // Check if we now have sufficient content if (this._isExtractionSufficient(this.data.content)) { // Set a flag to indicate we found an optimal method this.foundOptimalMethod = true; this.optimalMethod = extractorName; this.successfulExtractor = extractorName; // Store the successful extractor for future use break; } } } catch (error) { // Continue to next extractor if one fails } } } // Always extract images await this.extractImages(); // Always extract metadata await this.extractMetadata(); return this.data; } catch (error) { return this.data; } } /** * Simple extraction for basic sites (much faster) * @private */ async _simpleExtraction() { // Get basic page info await this.extractTitle(); await this.extractMetadata(); // Extract visible text content with minimal processing const content = await this.page.evaluate(() => { // Simple function to get visible text from the page function getVisibleText(element, depth = 0) { if (!element) return []; if (depth > 5) return []; // Limit recursion // Skip hidden elements const style = window.getComputedStyle(element); if (style.display === 'none' || style.visibility === 'hidden') { return []; } // Get text content if this is a text node if (element.nodeType === Node.TEXT_NODE) { const text = element.textContent.trim(); return text ? [{ text, tag: 'p' }] : []; } // Skip script, style, and certain other elements const tagName = element.tagName?.toLowerCase(); if (['script', 'style', 'noscript', 'svg', 'iframe'].includes(tagName)) { return []; } // For headings, get text with appropriate tag if (['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName)) { const text = element.textContent.trim(); return text ? [{ text, tag: tagName, important: true }] : []; } // For paragraphs or divs with text if (tagName === 'p' || (tagName === 'div' && element.childNodes.length <= 3)) { const text = element.textContent.trim(); if (text && text.length > 10) { return [{ text, tag: tagName }]; } } // Recursively process child nodes let results = []; for (const child of element.childNodes) { results = results.concat(getVisibleText(child, depth + 1)); } return results; } // Main content usually in these elements const mainElements = [ document.querySelector('main'), document.querySelector('article'), document.querySelector('#content'), document.querySelector('.content'), document.querySelector('#main'), document.querySelector('.main'), document.body // Fallback ].filter(Boolean); // Get text from main content first let allContent = []; for (const element of mainElements) { const content = getVisibleText(element); if (content.length > 0) { allContent = allContent.concat(content); // If we found good content, we can stop if (content.length > 5) break; } } // Images const images = Array.from(document.querySelectorAll('img')) .filter(img => { const src = img.src; const width = img.width || 0; const height = img.height || 0; return src && src.startsWith('http') && width > 100 && height > 100; }) .map(img => ({ url: img.src, alt: img.alt || '', width: img.width || 0, height: img.height || 0 })); return { content: allContent, images }; }); // Add the extracted content and images to our data if (content.content && content.content.length > 0) { this.data.content = this.data.content.concat(content.content); } if (content.images && content.images.length > 0) { this.data.images = this.data.images.concat(content.images); } // Deduplicate this._deduplicateContent(); } /** * Extracts content from article elements */ async extractFromArticle() { try { const articleContent = await this.page.evaluate(() => { const results = []; const articles = document.querySelectorAll('article'); for (const article of articles) { // Get all paragraphs, headings, and lists within the article const elements = article.querySelectorAll('p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote'); for (const elem of elements) { const text = elem.textContent.trim(); if (text.length > 0) { results.push(text); } } // If no structured elements found, get the raw text if (results.length === 0 && article.textContent.trim().length > 0) { results.push(article.textContent.trim()); } } return results; }); return articleContent; } catch (error) { return []; } } /** * Extracts content from main content elements */ async extractFromMainContent() { try { const mainContent = await this.page.evaluate(() => { const results = []; const mainElements = document.querySelectorAll('main, [role="main"], #main, .main, .content, #content, .post-content, .article-content'); for (const main of mainElements) { // Get all paragraphs, headings, and lists within the main content const elements = main.querySelectorAll('p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote'); for (const elem of elements) { const text = elem.textContent.trim(); if (text.length > 0) { results.push(text); } } // If no structured elements found, get the raw text if (results.length === 0 && main.textContent.trim().length > 0) { results.push(main.textContent.trim()); } } return results; }); return mainContent; } catch (error) { return []; } } /** * Extracts content from header-content-footer structure */ async extractFromHeaderContentFooter() { try { const content = await this.page.evaluate(() => { const results = []; const header = document.querySelector('header'); const footer = document.querySelector('footer'); if (!header || !footer) return results; // Function to get next sibling elements function getNextSiblings(elem, filter) { const siblings = []; while (elem && elem !== footer) { elem = elem.nextElementSibling; if (elem && elem !== footer && (!filter || filter(elem))) { siblings.push(elem); } } return siblings; } // Get all elements between header and footer const contentElements = getNextSiblings(header, elem => { // Skip empty elements or navigation return elem.textContent.trim().length > 0 && !elem.matches('nav, aside, .sidebar, .ad, .advertisement'); }); for (const elem of contentElements) { // Get all text-containing elements const textElements = elem.querySelectorAll('p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote'); for (const textElem of textElements) { const text = textElem.textContent.trim(); if (text.length > 0) { results.push(text); } } // If no text elements found, get the raw text if (textElements.length === 0 && elem.textContent.trim().length > 0) { results.push(elem.textContent.trim()); } } return results; }); return content; } catch (error) { return []; } } /** * Extracts content from multi-column layouts */ async extractFromMultiColumn() { try { const content = await this.page.evaluate(() => { const results = []; const columns = document.querySelectorAll('.column, .col, [class*="col-"], [class*="column-"]'); // Find the main content column (usually the largest one with most text) let mainColumn = null; let maxTextLength = 0; for (const column of columns) { const textLength = column.textContent.trim().length; if (textLength > maxTextLength) { maxTextLength = textLength; mainColumn = column; } } if (mainColumn) { // Get all paragraphs, headings, and lists within the main column const elements = mainColumn.querySelectorAll('p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote'); for (const elem of elements) { const text = elem.textContent.trim(); if (text.length > 0) { results.push(text); } } // If no structured elements found, get the raw text if (results.length === 0 && mainColumn.textContent.trim().length > 0) { results.push(mainColumn.textContent.trim()); } } return results; }); return content; } catch (error) { return []; } } /** * Extracts content from the largest content block on the page * This is a fallback method when other methods fail */ async extractFromLargest() { try { const content = await this.page.evaluate(() => { const results = []; // Find elements with substantial text content const allElements = document.querySelectorAll('div, section, main, article'); let bestElement = null; let maxTextLength = 0; let maxParagraphs = 0; for (const elem of allElements) { // Skip hidden elements, navigation, and other non-content elements if ( !elem.offsetParent || // Hidden element elem.matches('nav, header, footer, aside, .sidebar, .ad, .advertisement, .menu') || elem.id && /nav|menu|sidebar|footer|header/i.test(elem.id) || elem.className && /nav|menu|sidebar|footer|header/i.test(elem.className) ) { continue; } const paragraphs = elem.querySelectorAll('p'); const textLength = elem.textContent.trim().length; // Prioritize elements with more paragraphs and more text if ( (paragraphs.length > maxParagraphs) || (paragraphs.length === maxParagraphs && textLength > maxTextLength) ) { maxParagraphs = paragraphs.length; maxTextLength = textLength; bestElement = elem; } } if (bestElement) { // Get all paragraphs, headings, and lists within the best element const elements = bestElement.querySelectorAll('p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote'); for (const elem of elements) { const text = elem.textContent.trim(); if (text.length > 0) { results.push(text); } } // If no structured elements found, get the raw text if (results.length === 0 && bestElement.textContent.trim().length > 0) { results.push(bestElement.textContent.trim()); } } return results; }); return content; } catch (error) { return []; } } /** * Extracts content from elements with semantic meaning * Looks for elements with semantic roles or schema.org attributes */ async extractFromSemantic() { try { const semanticContent = await this.page.evaluate(() => { const results = []; // Look for elements with semantic attributes const semanticSelectors = [ // ARIA roles related to content '[role="article"]', '[role="main"]', '[role="contentinfo"]', '[role="document"]', '[role="region"]', // Schema.org attributes '[itemtype*="Article"]', '[itemtype*="NewsArticle"]', '[itemtype*="BlogPosting"]', '[itemtype*="WebPage"]', '[itemtype*="CreativeWork"]', // HTML5 semantic elements not covered by other methods 'article', 'section', // OpenGraph marked content '[property="og:description"]', // Common content classes without being overly specific '.post-content', '.entry-content', '.article-content', '.blog-content', '.story-content', '.page-content' ]; // Try each semantic selector for (const selector of semanticSelectors) { try { const elements = document.querySelectorAll(selector); for (const element of elements) { // Skip elements with no content or too small content if (element.textContent.trim().length < 100) continue; // Skip hidden elements if (!element.offsetParent) continue; // Skip navigation, sidebars, etc. if ( element.matches('nav, aside, header, footer') || element.id && /nav|menu|sidebar|header|footer/i.test(element.id) || element.className && /nav|menu|sidebar|header|footer/i.test(element.className) ) { continue; } // Get all text elements within this semantic element const textElements = element.querySelectorAll('p, h1, h2, h3, h4, h5, h6, li, blockquote, figcaption, code, pre'); if (textElements.length > 0) { for (const textElem of textElements) { const text = textElem.textContent.trim(); if (text.length > 0) { results.push(text); } } } else if (element.textContent.trim().length > 150) { // If no structured elements but has substantial text results.push(element.textContent.trim()); } } } catch (e) { // Skip this selector if there's an error continue; } } return results; }); return semanticContent; } catch (error) { return []; } } /** * Extract content from recipe pages */ async extractFromRecipes() { try { const recipeData = await this.page.evaluate(() => { const content = []; // Try to extract title const titleElement = document.querySelector('h1'); if (titleElement) { content.push(titleElement.textContent.trim()); } // Extract from JSON-LD first (most reliable) const jsonLdElements = document.querySelectorAll('script[type="application/ld+json"]'); let recipeStructuredData = null; for (const element of jsonLdElements) { try { const parsed = JSON.parse(element.textContent); let recipeData = null; if (parsed['@type'] === 'Recipe') { recipeData = parsed; } else if (Array.isArray(parsed['@type']) && parsed['@type'].includes('Recipe')) { recipeData = parsed; } else if (parsed['@graph']) { const recipeItem = parsed['@graph'].find(item => item['@type'] === 'Recipe'); if (recipeItem) { recipeData = recipeItem; } } if (recipeData) { recipeStructuredData = recipeData; break; } } catch (e) { // Ignore JSON parse errors } } // If we have structured data, extract from it if (recipeStructuredData) { // Add recipe description if (recipeStructuredData.description) { content.push(recipeStructuredData.description); } // Add ingredients heading // content.push('Ingredients'); // Removing label // Add ingredients if (Array.isArray(recipeStructuredData.recipeIngredient)) { recipeStructuredData.recipeIngredient.forEach(ingredient => { content.push(ingredient); }); } // Add instructions heading // content.push('Instructions'); // Removing label // Add instructions if (Array.isArray(recipeStructuredData.recipeInstructions)) { recipeStructuredData.recipeInstructions.forEach((instruction, index) => { const instructionText = typeof instruction === 'string' ? instruction : (instruction.text || ''); if (instructionText) { content.push(`${index + 1}. ${instructionText}`); } }); } // Add recipe metadata const recipeMetadata = []; if (recipeStructuredData.prepTime) { recipeMetadata.push(`Prep Time: ${recipeStructuredData.prepTime}`); } if (recipeStructuredData.cookTime) { recipeMetadata.push(`Cook Time: ${recipeStructuredData.cookTime}`); } if (recipeStructuredData.totalTime) { recipeMetadata.push(`Total Time: ${recipeStructuredData.totalTime}`); } if (recipeStructuredData.recipeYield) { recipeMetadata.push(`Servings: ${recipeStructuredData.recipeYield}`); } if (recipeMetadata.length > 0) { // content.push('Recipe Information'); // Removing label recipeMetadata.forEach(item => content.push(item)); } return content; } // If no structured data, extract from HTML // Try to find recipe container const recipeContainerSelectors = [ '.recipe', '.recipe-container', '.recipe-card', '.recipe-content', '.recipe-body', '.recipe-main', '[itemtype*="Recipe"]', '[typeof*="Recipe"]', 'article', 'main', '#content', '.content', '.post-content', '.entry-content' ]; let recipeContainer = null; for (const selector of recipeContainerSelectors) { try { const container = document.querySelector(selector); if (container) { recipeContainer = container; break; } } catch (e) { // Ignore errors with selectors } } if (!recipeContainer) { recipeContainer = document.body; // Fall back to body if no container found } // Extract ingredients let ingredients = []; const ingredientSelectors = [ '[itemprop="recipeIngredient"]', '.ingredients li', '.ingredient-list li', '.recipe-ingredients li', '[class*="ingredient"] li', '[id*="ingredient"] li', 'ul li' // fallback - look at all list items if others don't work ]; for (const selector of ingredientSelectors) { try { const ingredientElements = recipeContainer.querySelectorAll(selector); if (ingredientElements.length > 0) { ingredients = Array.from(ingredientElements) .map(el => el.textContent.trim()) .filter(text => // Filter for likely ingredients (contains measurements or common food items) text.match(/\d+\s*(cup|tbsp|tsp|tablespoon|teaspoon|oz|ounce|pound|lb|gram|g|ml|l)/i) !== null || text.length > 3 ); if (ingredients.length > 0) { break; } } } catch (e) { // Ignore errors with selectors } } // If no ingredients found with selectors, try looking for paragraphs after an "Ingredients" heading if (ingredients.length === 0) { const headings = Array.from(recipeContainer.querySelectorAll('h1, h2, h3, h4, h5, h6')); for (const heading of headings) { if (heading.textContent.trim().toLowerCase().includes('ingredient')) { // Get all paragraphs and list items that follow this heading let currentNode = heading.nextElementSibling; while (currentNode && !currentNode.tagName.match(/^H[1-6]$/)) { if (currentNode.tagName === 'UL' || currentNode.tagName === 'OL') { const items = Array.from(currentNode.querySelectorAll('li')) .map(li => li.textContent.trim()) .filter(text => text.length > 3); if (items.length > 0) { ingredients = items; break; } } else if (currentNode.tagName === 'P') { const text = currentNode.textContent.trim(); if (text.length > 3) { ingredients.push(text); } } currentNode = currentNode.nextElementSibling; } if (ingredients.length > 0) break; } } } // Extract instructions let instructions = []; const instructionSelectors = [ '[itemprop="recipeInstructions"] li', '.instructions li', '.recipe-instructions li', '.steps li', '.method li', '.directions li', '[class*="instruction"] li', '[id*="instruction"] li', '[class*="direction"] li', '[id*="direction"] li', '[class*="step"] li', '[id*="step"] li', 'ol li' // fallback - numbered lists are often instructions ]; for (const selector of instructionSelectors) { try { const instructionElements = recipeContainer.querySelectorAll(selector); if (instructionElements.length > 0) { instructions = Array.from(instructionElements) .map(el => el.textContent.trim()) .filter(text => text.length > 10); // Instructions are usually longer if (instructions.length > 0) { break; } } } catch (e) { // Ignore errors with selectors } } // If no list items found for instructions, look for paragraphs in instruction container if (instructions.length === 0) { const instructionContainerSelectors = [ '[itemprop="recipeInstructions"]', '.instructions', '.recipe-instructions', '.steps', '.method', '.directions', '[class*="instruction"]', '[id*="instruction"]', '[class*="direction"]', '[id*="direction"]', '[class*="step"]', '[id*="step"]' ]; for (const selector of instructionContainerSelectors) { try { const container = recipeContainer.querySelector(selector); if (container) { const paragraphs = container.querySelectorAll('p'); if (paragraphs.length > 0) { instructions = Array.from(paragraphs) .map(el => el.textContent.trim()) .filter(text => text.length > 10); if (instructions.length > 0) { break; } } } } catch (e) { // Ignore errors with selectors } } } // If still no instructions, try looking for paragraphs after an "Instructions" heading if (instructions.length === 0) { const headings = Array.from(recipeContainer.querySelectorAll('h1, h2, h3, h4, h5, h6')); for (const heading of headings) { if (heading.textContent.trim().toLowerCase().match(/instruction|direction|method|step|preparation/)) { // Get all paragraphs and list items that follow this heading let currentNode = heading.nextElementSibling; while (currentNode && !currentNode.tagName.match(/^H[1-6]$/)) { if (currentNode.tagName === 'OL' || currentNode.tagName === 'UL') { const items = Array.from(currentNode.querySelectorAll('li')) .map(li => li.textContent.trim()) .filter(text => text.length > 10); if (items.length > 0) { instructions = items; break; } } else if (currentNode.tagName === 'P') { const text = currentNode.textContent.trim(); if (text.length > 10) { instructions.push(text); } } currentNode = currentNode.nextElementSibling; } if (instructions.length > 0) break; } } } // Build content from extracted elements // If we have ingredients, add them if (ingredients.length > 0) { // content.push('Ingredients'); // Removing label ingredients.forEach(ingredient => { content.push(ingredient); }); } // If we have instructions, add them if (instructions.length > 0) { // content.push('Instructions'); // Removing label instructions.forEach((instruction, index) => { // Add numbering if not already numbered if (!instruction.match(/^\d+[\.\)]/)) { content.push(`${index + 1}. ${instruction}`); } else { content.push(instruction); } }); } // If we still don't have content, extract paragraphs from the recipe container if (content.length === 0) { const paragraphs = recipeContainer.querySelectorAll('p'); if (paragraphs.length > 0) { Array.from(paragraphs) .map(p => p.textContent.trim()) .filter(text => text.length > 20) .forEach(text => content.push(text)); } } return content; }); return recipeData; } catch (error) { return []; } } /** * Extracts content from product pages */ async extractFromProduct() { try { const productContent = await this.page.evaluate(() => { const results = []; // REI-specific extraction const isREIProduct = window.location.href.includes('rei.com'); if (isREIProduct) { try { // Try REI-specific selectors first const productTitle = document.querySelector('[data-ui="product-title"]')?.textContent.trim() || document.querySelector('h1')?.textContent.trim(); if (productTitle) { results.push(`Product Title: ${productTitle}`); } const brandName = document.querySelector('[data-ui="product-brand"]')?.textContent.trim(); if (brandName) { results.push(`Brand: ${brandName}`); } const productPrice = document.querySelector('[data-ui="sale-price"]')?.textContent.trim() || document.querySelector('[data-ui="display-price"]')?.textContent.trim(); if (productPrice) { results.push(`Price: ${productPrice}`); } // Extract product description const descriptionElem = document.querySelector('.product-information-container'); if (descriptionElem) { const descText = descriptionElem.textContent.trim(); if (descText.length > 0) { results.push(`Description: ${descText}`); } } // Extract specifications const specSections = document.querySelectorAll('.pdp-accordion-content'); for (const section of specSections) { const sectionTitle = section.previousElementSibling?.textContent.trim(); if (sectionTitle) { results.push(`Section: ${sectionTitle}`); } const specItems = section.querySelectorAll('li, p'); for (const item of specItems) { const itemText = item.textContent.trim(); if (itemText.length > 0) { results.push(`- ${itemText}`); } } } // If we found good content, return it if (results.length > 0) { return results; } } catch (e) { // If REI-specific extraction fails, continue with generic extraction console.error('REI-specific extraction failed:', e); } } // Generic extraction for other sites // Extract product title const titleSelectors = [ '[itemprop="name"]', '.product-title', '.product-name', '.product__title', 'h1.title', '[data-testid="product-title"]', '.pdp-title', 'h1', '#productTitle' ]; for (const selector of titleSelectors) { try { const titleElement = document.querySelector(selector); if (titleElement && titleElement.textContent.trim().length > 0) { results.push(`Product Title: ${titleElement.textContent.trim()}`); break; } } catch (e) { continue; } } // Extract price const priceSelectors = [ '[itemprop="price"]', '.price', '.product-price', '.product__price', '[data-testid="price"]', '.pdp-price', '.current-price', '#priceblock_ourprice', '.price-characteristic' ]; for (const selector of priceSelectors) { try { const priceElement = document.querySelector(selector); if (priceElement && priceElement.textContent.trim().length > 0) { results.push(`Price: ${priceElement.textContent.trim()}`); break; } } catch (e) { continue; } } // Extract description const descriptionSelectors = [ '[itemprop="description"]', '.product-description', '.description', '.product__description', '#description', '.pdp-description', '[data-testid="product-description"]', '#productDescription', '[data-component-type="s-product-description"]' ]; for (const selector of descriptionSelectors) { try { const descElements = document.querySelectorAll(selector); if (descElements.length > 0) { for (const elem of descElements) { const text = elem.textContent.trim(); if (text.length > 0) { results.push(`Description: ${text}`); } } } } catch (e) { continue; } } // Extract features/specs const featureSelectors = [ '.product-features', '.features', '.specifications', '.specs', '.product-specs', '.tech-specs', '[data-testid="product-specs"]', '#feature-bullets', '.product-attributes', '.accordion-inner' ]; for (const selector of featureSelectors) { try { const featureElements = document.querySelectorAll(`${selector} li, ${selector} p, ${selector} div`); if (featureElements.length > 0) { for (const elem of featureElements) { const text = elem.textContent.trim(); if (text.length > 10 && text.length < 500) { // Reasonable feature length results.push(`Feature: ${text}`); } } } } catch (e) { continue; } } // If we have very little content, try more aggressive extraction if (results.length < 3) { // Look for product details in any container const possibleContainers = document.querySelectorAll('.product-details, .product-info, [class*="product-"], [class*="pdp-"], [id*="product-"]'); for (const container of possibleContainers) { // Skip tiny containers or hidden elements if (!container.offsetParent || container.textContent.trim().length < 50) continue; // Get structured content like paragraphs, lists, etc. const textElements = container.querySelectorAll('p, li, h3, h4, h5, h6'); for (const elem of textElements) { const text = elem.textContent.trim(); if (text.length > 10 && !results.includes(text)) { results.push(text); } } // If no structured elements, get the container text if (textElements.length === 0 && container.textContent.trim().length > 50) { results.push(container.textContent.trim().substring(0, 500) + (container.textContent.trim().length > 500 ? '...' : '')); } } } // Last resort: grab heading and all nearby paragraphs if (results.length < 2) { const h1 = document.querySelector('h1'); if (h1) { if (!results.some(r => r.includes(h1.textContent.trim()))) { results.push(`Product Title: ${h1.textContent.trim()}`); } let sibling = h1.nextElementSibling; while (sibling && results.length < 10) { if (sibling.tagName === 'P' || sibling.tagName === 'DIV') { const text = sibling.textContent.trim(); if (text.length > 20 && !results.includes(text)) { results.push(text); } } sibling = sibling.nextElementSibling; } } } return results; }); return productContent; } catch (error) { return []; } } /** * Extracts content from documentation pages */ async extractFromDocumentation() { try { const docContent = await this.page.evaluate(() => { const results = []; // MDN-specific extraction const isMDN = window.location.href.includes('mozilla.org') || window.location.href.includes('mdn.'); if (isMDN) { try { // Try MDN-specific selectors first const mainContent = document.querySelector('.main-page-content, .article, #content-main, .article__content'); if (mainContent) { // Extract title and headings const title = document.querySelector('h1')?.textContent.trim(); if (title) { results.push(`Title: ${title}`); } // Extract all section headings and their content const headings = mainContent.querySelectorAll('h2, h3, h4, h5, h6'); for (const heading of headings) { const headingText = heading.textContent.trim(); if (headingText) { results.push(`\n${headingText}`); // Get all content until next heading let nextElem = heading.nextElementSibling; while (nextElem && !['H2', 'H3', 'H4', 'H5', 'H6'].includes(nextElem.tagName)) { // Only add paragraphs, lists, and code blocks if (['P', 'UL', 'OL', 'PRE', 'CODE', 'DL', 'TABLE'].includes(nextElem.tagName)) { const text = nextElem.textContent.trim(); if (text.length > 0) { results.push(text); } } nextElem = nextElem.nextElementSibling; } } } // If we found good content, return it if (results.length > 0) { return results; } } } catch (e) { // If MDN-specific extraction fails, continue with generic extraction console.error('MDN-specific extraction failed:', e); } } // Try documentation-specific selectors const docSelectors = [ '.documentation', '.docs', '.doc-content', '.article__content', '.article-content', '.documentation__main', '.documentation__content', '.markdown-body', '.markdown-section', '