@pinkpixel/prysm-mcp
Version:
MCP server for the Prysm web scraper - enabling AI assistants to scrape web content
1,425 lines (1,246 loc) • 85 kB
JavaScript
/**
* MainExtractor - Content extraction focused on trying all possible methods
*
* This class implements multiple extraction methods and tries all of them
* on every page to maximize content extraction.
*/
// Import default options from utils
const { DEFAULT_OPTIONS } = require('./defaultOptions');
// Import chalk for colored output
const chalk = require('chalk');
// Delay helper: returns a promise that settles (with no value) after `ms` milliseconds
const wait = (ms) => new Promise((resolve) => {
  setTimeout(resolve, ms);
});
// Backslash-escapes selector punctuation (: / + [ ] ( ) ! @ , ~) and trims the result.
// Dots pass through unchanged. Falsy input (undefined, null, '') yields ''.
const sanitizeSelector = (selector) => {
  if (!selector) {
    return '';
  }
  // One pass over the string: every listed character is prefixed with a backslash
  const escaped = selector.replace(/[:\/+\[\]()!@,~]/g, '\\$&');
  return escaped.trim();
};
class MainExtractor {
/**
* Create a new MainExtractor instance
* @param {Object} page - Puppeteer page instance
* @param {Object} options - Options for extraction
*/
constructor(page, options = {}) {
this.page = page;
this.data = {
url: '',
title: '',
content: [],
images: [],
metadata: {}
};
// Apply analysis results if available
this.analysisResult = options.analysisResult || null;
// Configure extractor options from analysis
this.priorityExtractors = [];
this.skipExtractors = [];
// Track the successful extractor to optimize subsequent extraction after pagination
this.successfulExtractor = null;
// Add compatibility for different Puppeteer versions
this.ensureCompatibility();
}
/**
* Ensure compatibility with different Puppeteer versions
*/
ensureCompatibility() {
try {
// Try to add waitForFunction if it doesn't exist
if (!this.page.waitForFunction) {
this.page.waitForFunction = async (pageFunction, options = {}, ...args) => {
return await this.page.evaluate(pageFunction, ...args);
};
}
} catch (error) {
// Silently continue if we can't add compatibility
}
}
/**
* Safe wait function that works with all Puppeteer versions
*/
async safeWait(ms) {
try {
if (this.page.waitForTimeout) {
await this.page.waitForTimeout(ms);
} else {
await new Promise(resolve => setTimeout(resolve, ms));
}
} catch (error) {
// Fallback to setTimeout if there's an error
await new Promise(resolve => setTimeout(resolve, ms));
}
}
/**
* Determines if an extractor should be run based on Smart Scan analysis
* @param {string} extractorName - Name of the extractor method
* @returns {boolean} Whether the extractor should run
*/
shouldRunExtractor(extractorName) {
// Skip if explicitly included in skipExtractors
if (this.skipExtractors.includes(extractorName)) {
return false;
}
// If there are priority extractors defined, only run those
if (this.priorityExtractors.length > 0) {
return this.priorityExtractors.includes(extractorName);
}
// Otherwise run all extractors
return true;
}
/**
* Check if extraction is already sufficient
* @param {Array} content - The extracted content
* @returns {boolean} True if content is sufficient
* @private
*/
_isExtractionSufficient(content) {
if (!content || !Array.isArray(content)) return false;
// Filter out empty items
const validContent = content.filter(item => {
if (typeof item === 'string') return item.trim().length > 0;
if (typeof item === 'object' && item.text) return item.text.trim().length > 0;
return false;
});
// Check content count
if (validContent.length < 2) return false;
// Calculate total text length
const totalLength = validContent.reduce((sum, item) => {
const text = typeof item === 'string' ? item : item.text || '';
return sum + text.length;
}, 0);
// Check if we have at least one heading/title
const hasHeading = validContent.some(item => {
if (typeof item === 'object' && item.tag) {
return ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title'].includes(item.tag);
}
return false;
});
// Check if this is a minimal site (like example.com)
const isMinimalSite = this.page.url().includes('example.com') ||
totalLength < 500 && validContent.length < 5;
// For minimal sites, use much lower thresholds
if (isMinimalSite) {
return hasHeading && validContent.length >= 2;
}
// Consider extraction sufficient if we have:
// 1. Reasonable amount of content (>1000 chars)
// 2. At least one heading
// 3. At least 5 content items
return (totalLength > 1000 && hasHeading && validContent.length >= 5);
}
/**
* Main extraction method that tries all extraction approaches
*/
async extract() {
try {
let newContent = [];
// Extract title if not already set
if (!this.data.title) {
this.data.title = await this.extractTitle();
}
// If we already have a successful extractor from previous extraction,
// only use that one to avoid redundant processing
if (this.successfulExtractor) {
try {
let extractedContent = [];
switch (this.successfulExtractor) {
case 'recipes':
extractedContent = await this.extractFromRecipes();
break;
case 'article':
extractedContent = await this.extractFromArticle();
break;
case 'mainContent':
extractedContent = await this.extractFromMainContent();
break;
case 'semantic':
extractedContent = await this.extractFromSemantic();
break;
case 'headerContentFooter':
extractedContent = await this.extractFromHeaderContentFooter();
break;
case 'multiColumn':
extractedContent = await this.extractFromMultiColumn();
break;
case 'contentSections':
extractedContent = await this.extractFromContentSections();
break;
case 'singleColumn':
extractedContent = await this.extractFromSingleColumn();
break;
case 'largestContent':
extractedContent = await this.extractFromLargest();
break;
case 'product':
extractedContent = await this.extractFromProduct();
break;
case 'documentation':
extractedContent = await this.extractFromDocumentation();
break;
case 'basic':
extractedContent = await this.extractFromBasic();
break;
case 'textDensity':
extractedContent = await this.extractFromTextDensity();
break;
}
if (extractedContent && extractedContent.length > 0) {
newContent.push(...extractedContent);
// Add to this.data.content immediately to check if it's sufficient
const uniqueContent = extractedContent.filter(item => {
if (!item) return false;
const text = typeof item === 'string' ? item : item.text;
if (!text) return false;
return !this.data.content.some(existing => {
const existingText = typeof existing === 'string' ? existing : existing.text;
return existingText === text;
});
});
this.data.content.push(...uniqueContent);
}
} catch (error) {
// If the successful extractor fails, fall back to trying all extractors
this.successfulExtractor = null;
}
}
// If no successful extractor yet, or if it failed, try all extractors
if (!this.successfulExtractor) {
// Sort extractors by priority if provided
const sortedExtractors = this._getSortedExtractors();
// Try each extractor until we get sufficient content
for (const extractorName of sortedExtractors) {
if (this._isExtractionSufficient(this.data.content)) {
// We already have good content, skip remaining extractors
break;
}
try {
let extractedContent = [];
switch (extractorName) {
case 'recipes':
extractedContent = await this.extractFromRecipes();
break;
case 'article':
extractedContent = await this.extractFromArticle();
break;
case 'mainContent':
extractedContent = await this.extractFromMainContent();
break;
case 'semantic':
extractedContent = await this.extractFromSemantic();
break;
case 'headerContentFooter':
extractedContent = await this.extractFromHeaderContentFooter();
break;
case 'multiColumn':
extractedContent = await this.extractFromMultiColumn();
break;
case 'contentSections':
extractedContent = await this.extractFromContentSections();
break;
case 'singleColumn':
extractedContent = await this.extractFromSingleColumn();
break;
case 'largestContent':
extractedContent = await this.extractFromLargest();
break;
case 'product':
extractedContent = await this.extractFromProduct();
break;
case 'documentation':
extractedContent = await this.extractFromDocumentation();
break;
case 'basic':
extractedContent = await this.extractFromBasic();
break;
case 'textDensity':
extractedContent = await this.extractFromTextDensity();
break;
}
if (extractedContent && extractedContent.length > 0) {
newContent.push(...extractedContent);
// Add to this.data.content immediately to check if it's sufficient
const uniqueContent = extractedContent.filter(item => {
if (!item) return false;
const text = typeof item === 'string' ? item : item.text;
if (!text) return false;
return !this.data.content.some(existing => {
const existingText = typeof existing === 'string' ? existing : existing.text;
return existingText === text;
});
});
this.data.content.push(...uniqueContent);
// Check if we now have sufficient content
if (this._isExtractionSufficient(this.data.content)) {
// Set a flag to indicate we found an optimal method
this.foundOptimalMethod = true;
this.optimalMethod = extractorName;
this.successfulExtractor = extractorName; // Store the successful extractor for future use
break;
}
}
} catch (error) {
// Continue to next extractor if one fails
}
}
}
// Always extract images
await this.extractImages();
// Always extract metadata
await this.extractMetadata();
return this.data;
} catch (error) {
return this.data;
}
}
/**
* Simple extraction for basic sites (much faster)
* @private
*/
async _simpleExtraction() {
// Get basic page info
await this.extractTitle();
await this.extractMetadata();
// Extract visible text content with minimal processing
const content = await this.page.evaluate(() => {
// Simple function to get visible text from the page
function getVisibleText(element, depth = 0) {
if (!element) return [];
if (depth > 5) return []; // Limit recursion
// Skip hidden elements
const style = window.getComputedStyle(element);
if (style.display === 'none' || style.visibility === 'hidden') {
return [];
}
// Get text content if this is a text node
if (element.nodeType === Node.TEXT_NODE) {
const text = element.textContent.trim();
return text ? [{ text, tag: 'p' }] : [];
}
// Skip script, style, and certain other elements
const tagName = element.tagName?.toLowerCase();
if (['script', 'style', 'noscript', 'svg', 'iframe'].includes(tagName)) {
return [];
}
// For headings, get text with appropriate tag
if (['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName)) {
const text = element.textContent.trim();
return text ? [{ text, tag: tagName, important: true }] : [];
}
// For paragraphs or divs with text
if (tagName === 'p' || (tagName === 'div' && element.childNodes.length <= 3)) {
const text = element.textContent.trim();
if (text && text.length > 10) {
return [{ text, tag: tagName }];
}
}
// Recursively process child nodes
let results = [];
for (const child of element.childNodes) {
results = results.concat(getVisibleText(child, depth + 1));
}
return results;
}
// Main content usually in these elements
const mainElements = [
document.querySelector('main'),
document.querySelector('article'),
document.querySelector('#content'),
document.querySelector('.content'),
document.querySelector('#main'),
document.querySelector('.main'),
document.body // Fallback
].filter(Boolean);
// Get text from main content first
let allContent = [];
for (const element of mainElements) {
const content = getVisibleText(element);
if (content.length > 0) {
allContent = allContent.concat(content);
// If we found good content, we can stop
if (content.length > 5) break;
}
}
// Images
const images = Array.from(document.querySelectorAll('img'))
.filter(img => {
const src = img.src;
const width = img.width || 0;
const height = img.height || 0;
return src && src.startsWith('http') && width > 100 && height > 100;
})
.map(img => ({
url: img.src,
alt: img.alt || '',
width: img.width || 0,
height: img.height || 0
}));
return { content: allContent, images };
});
// Add the extracted content and images to our data
if (content.content && content.content.length > 0) {
this.data.content = this.data.content.concat(content.content);
}
if (content.images && content.images.length > 0) {
this.data.images = this.data.images.concat(content.images);
}
// Deduplicate
this._deduplicateContent();
}
/**
* Extracts content from article elements
*/
async extractFromArticle() {
try {
const articleContent = await this.page.evaluate(() => {
const results = [];
const articles = document.querySelectorAll('article');
for (const article of articles) {
// Get all paragraphs, headings, and lists within the article
const elements = article.querySelectorAll('p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote');
for (const elem of elements) {
const text = elem.textContent.trim();
if (text.length > 0) {
results.push(text);
}
}
// If no structured elements found, get the raw text
if (results.length === 0 && article.textContent.trim().length > 0) {
results.push(article.textContent.trim());
}
}
return results;
});
return articleContent;
} catch (error) {
return [];
}
}
/**
* Extracts content from main content elements
*/
async extractFromMainContent() {
try {
const mainContent = await this.page.evaluate(() => {
const results = [];
const mainElements = document.querySelectorAll('main, [role="main"], #main, .main, .content, #content, .post-content, .article-content');
for (const main of mainElements) {
// Get all paragraphs, headings, and lists within the main content
const elements = main.querySelectorAll('p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote');
for (const elem of elements) {
const text = elem.textContent.trim();
if (text.length > 0) {
results.push(text);
}
}
// If no structured elements found, get the raw text
if (results.length === 0 && main.textContent.trim().length > 0) {
results.push(main.textContent.trim());
}
}
return results;
});
return mainContent;
} catch (error) {
return [];
}
}
/**
* Extracts content from header-content-footer structure
*/
async extractFromHeaderContentFooter() {
try {
const content = await this.page.evaluate(() => {
const results = [];
const header = document.querySelector('header');
const footer = document.querySelector('footer');
if (!header || !footer) return results;
// Function to get next sibling elements
function getNextSiblings(elem, filter) {
const siblings = [];
while (elem && elem !== footer) {
elem = elem.nextElementSibling;
if (elem && elem !== footer && (!filter || filter(elem))) {
siblings.push(elem);
}
}
return siblings;
}
// Get all elements between header and footer
const contentElements = getNextSiblings(header, elem => {
// Skip empty elements or navigation
return elem.textContent.trim().length > 0 &&
!elem.matches('nav, aside, .sidebar, .ad, .advertisement');
});
for (const elem of contentElements) {
// Get all text-containing elements
const textElements = elem.querySelectorAll('p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote');
for (const textElem of textElements) {
const text = textElem.textContent.trim();
if (text.length > 0) {
results.push(text);
}
}
// If no text elements found, get the raw text
if (textElements.length === 0 && elem.textContent.trim().length > 0) {
results.push(elem.textContent.trim());
}
}
return results;
});
return content;
} catch (error) {
return [];
}
}
/**
* Extracts content from multi-column layouts
*/
async extractFromMultiColumn() {
try {
const content = await this.page.evaluate(() => {
const results = [];
const columns = document.querySelectorAll('.column, .col, [class*="col-"], [class*="column-"]');
// Find the main content column (usually the largest one with most text)
let mainColumn = null;
let maxTextLength = 0;
for (const column of columns) {
const textLength = column.textContent.trim().length;
if (textLength > maxTextLength) {
maxTextLength = textLength;
mainColumn = column;
}
}
if (mainColumn) {
// Get all paragraphs, headings, and lists within the main column
const elements = mainColumn.querySelectorAll('p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote');
for (const elem of elements) {
const text = elem.textContent.trim();
if (text.length > 0) {
results.push(text);
}
}
// If no structured elements found, get the raw text
if (results.length === 0 && mainColumn.textContent.trim().length > 0) {
results.push(mainColumn.textContent.trim());
}
}
return results;
});
return content;
} catch (error) {
return [];
}
}
/**
* Extracts content from the largest content block on the page
* This is a fallback method when other methods fail
*/
async extractFromLargest() {
try {
const content = await this.page.evaluate(() => {
const results = [];
// Find elements with substantial text content
const allElements = document.querySelectorAll('div, section, main, article');
let bestElement = null;
let maxTextLength = 0;
let maxParagraphs = 0;
for (const elem of allElements) {
// Skip hidden elements, navigation, and other non-content elements
if (
!elem.offsetParent || // Hidden element
elem.matches('nav, header, footer, aside, .sidebar, .ad, .advertisement, .menu') ||
elem.id && /nav|menu|sidebar|footer|header/i.test(elem.id) ||
elem.className && /nav|menu|sidebar|footer|header/i.test(elem.className)
) {
continue;
}
const paragraphs = elem.querySelectorAll('p');
const textLength = elem.textContent.trim().length;
// Prioritize elements with more paragraphs and more text
if (
(paragraphs.length > maxParagraphs) ||
(paragraphs.length === maxParagraphs && textLength > maxTextLength)
) {
maxParagraphs = paragraphs.length;
maxTextLength = textLength;
bestElement = elem;
}
}
if (bestElement) {
// Get all paragraphs, headings, and lists within the best element
const elements = bestElement.querySelectorAll('p, h1, h2, h3, h4, h5, h6, ul, ol, blockquote');
for (const elem of elements) {
const text = elem.textContent.trim();
if (text.length > 0) {
results.push(text);
}
}
// If no structured elements found, get the raw text
if (results.length === 0 && bestElement.textContent.trim().length > 0) {
results.push(bestElement.textContent.trim());
}
}
return results;
});
return content;
} catch (error) {
return [];
}
}
/**
* Extracts content from elements with semantic meaning
* Looks for elements with semantic roles or schema.org attributes
*/
async extractFromSemantic() {
try {
const semanticContent = await this.page.evaluate(() => {
const results = [];
// Look for elements with semantic attributes
const semanticSelectors = [
// ARIA roles related to content
'[role="article"]',
'[role="main"]',
'[role="contentinfo"]',
'[role="document"]',
'[role="region"]',
// Schema.org attributes
'[itemtype*="Article"]',
'[itemtype*="NewsArticle"]',
'[itemtype*="BlogPosting"]',
'[itemtype*="WebPage"]',
'[itemtype*="CreativeWork"]',
// HTML5 semantic elements not covered by other methods
'article',
'section',
// OpenGraph marked content
'[property="og:description"]',
// Common content classes without being overly specific
'.post-content',
'.entry-content',
'.article-content',
'.blog-content',
'.story-content',
'.page-content'
];
// Try each semantic selector
for (const selector of semanticSelectors) {
try {
const elements = document.querySelectorAll(selector);
for (const element of elements) {
// Skip elements with no content or too small content
if (element.textContent.trim().length < 100) continue;
// Skip hidden elements
if (!element.offsetParent) continue;
// Skip navigation, sidebars, etc.
if (
element.matches('nav, aside, header, footer') ||
element.id && /nav|menu|sidebar|header|footer/i.test(element.id) ||
element.className && /nav|menu|sidebar|header|footer/i.test(element.className)
) {
continue;
}
// Get all text elements within this semantic element
const textElements = element.querySelectorAll('p, h1, h2, h3, h4, h5, h6, li, blockquote, figcaption, code, pre');
if (textElements.length > 0) {
for (const textElem of textElements) {
const text = textElem.textContent.trim();
if (text.length > 0) {
results.push(text);
}
}
} else if (element.textContent.trim().length > 150) {
// If no structured elements but has substantial text
results.push(element.textContent.trim());
}
}
} catch (e) {
// Skip this selector if there's an error
continue;
}
}
return results;
});
return semanticContent;
} catch (error) {
return [];
}
}
/**
* Extract content from recipe pages
*/
async extractFromRecipes() {
try {
const recipeData = await this.page.evaluate(() => {
const content = [];
// Try to extract title
const titleElement = document.querySelector('h1');
if (titleElement) {
content.push(titleElement.textContent.trim());
}
// Extract from JSON-LD first (most reliable)
const jsonLdElements = document.querySelectorAll('script[type="application/ld+json"]');
let recipeStructuredData = null;
for (const element of jsonLdElements) {
try {
const parsed = JSON.parse(element.textContent);
let recipeData = null;
if (parsed['@type'] === 'Recipe') {
recipeData = parsed;
} else if (Array.isArray(parsed['@type']) && parsed['@type'].includes('Recipe')) {
recipeData = parsed;
} else if (parsed['@graph']) {
const recipeItem = parsed['@graph'].find(item => item['@type'] === 'Recipe');
if (recipeItem) {
recipeData = recipeItem;
}
}
if (recipeData) {
recipeStructuredData = recipeData;
break;
}
} catch (e) {
// Ignore JSON parse errors
}
}
// If we have structured data, extract from it
if (recipeStructuredData) {
// Add recipe description
if (recipeStructuredData.description) {
content.push(recipeStructuredData.description);
}
// Add ingredients heading
// content.push('Ingredients'); // Removing label
// Add ingredients
if (Array.isArray(recipeStructuredData.recipeIngredient)) {
recipeStructuredData.recipeIngredient.forEach(ingredient => {
content.push(ingredient);
});
}
// Add instructions heading
// content.push('Instructions'); // Removing label
// Add instructions
if (Array.isArray(recipeStructuredData.recipeInstructions)) {
recipeStructuredData.recipeInstructions.forEach((instruction, index) => {
const instructionText = typeof instruction === 'string' ?
instruction :
(instruction.text || '');
if (instructionText) {
content.push(`${index + 1}. ${instructionText}`);
}
});
}
// Add recipe metadata
const recipeMetadata = [];
if (recipeStructuredData.prepTime) {
recipeMetadata.push(`Prep Time: ${recipeStructuredData.prepTime}`);
}
if (recipeStructuredData.cookTime) {
recipeMetadata.push(`Cook Time: ${recipeStructuredData.cookTime}`);
}
if (recipeStructuredData.totalTime) {
recipeMetadata.push(`Total Time: ${recipeStructuredData.totalTime}`);
}
if (recipeStructuredData.recipeYield) {
recipeMetadata.push(`Servings: ${recipeStructuredData.recipeYield}`);
}
if (recipeMetadata.length > 0) {
// content.push('Recipe Information'); // Removing label
recipeMetadata.forEach(item => content.push(item));
}
return content;
}
// If no structured data, extract from HTML
// Try to find recipe container
const recipeContainerSelectors = [
'.recipe', '.recipe-container', '.recipe-card', '.recipe-content',
'.recipe-body', '.recipe-main', '[itemtype*="Recipe"]', '[typeof*="Recipe"]',
'article', 'main', '#content', '.content', '.post-content', '.entry-content'
];
let recipeContainer = null;
for (const selector of recipeContainerSelectors) {
try {
const container = document.querySelector(selector);
if (container) {
recipeContainer = container;
break;
}
} catch (e) {
// Ignore errors with selectors
}
}
if (!recipeContainer) {
recipeContainer = document.body; // Fall back to body if no container found
}
// Extract ingredients
let ingredients = [];
const ingredientSelectors = [
'[itemprop="recipeIngredient"]',
'.ingredients li',
'.ingredient-list li',
'.recipe-ingredients li',
'[class*="ingredient"] li',
'[id*="ingredient"] li',
'ul li' // fallback - look at all list items if others don't work
];
for (const selector of ingredientSelectors) {
try {
const ingredientElements = recipeContainer.querySelectorAll(selector);
if (ingredientElements.length > 0) {
ingredients = Array.from(ingredientElements)
.map(el => el.textContent.trim())
.filter(text =>
// Filter for likely ingredients (contains measurements or common food items)
text.match(/\d+\s*(cup|tbsp|tsp|tablespoon|teaspoon|oz|ounce|pound|lb|gram|g|ml|l)/i) !== null ||
text.length > 3
);
if (ingredients.length > 0) {
break;
}
}
} catch (e) {
// Ignore errors with selectors
}
}
// If no ingredients found with selectors, try looking for paragraphs after an "Ingredients" heading
if (ingredients.length === 0) {
const headings = Array.from(recipeContainer.querySelectorAll('h1, h2, h3, h4, h5, h6'));
for (const heading of headings) {
if (heading.textContent.trim().toLowerCase().includes('ingredient')) {
// Get all paragraphs and list items that follow this heading
let currentNode = heading.nextElementSibling;
while (currentNode && !currentNode.tagName.match(/^H[1-6]$/)) {
if (currentNode.tagName === 'UL' || currentNode.tagName === 'OL') {
const items = Array.from(currentNode.querySelectorAll('li'))
.map(li => li.textContent.trim())
.filter(text => text.length > 3);
if (items.length > 0) {
ingredients = items;
break;
}
} else if (currentNode.tagName === 'P') {
const text = currentNode.textContent.trim();
if (text.length > 3) {
ingredients.push(text);
}
}
currentNode = currentNode.nextElementSibling;
}
if (ingredients.length > 0) break;
}
}
}
// Extract instructions
let instructions = [];
const instructionSelectors = [
'[itemprop="recipeInstructions"] li',
'.instructions li',
'.recipe-instructions li',
'.steps li',
'.method li',
'.directions li',
'[class*="instruction"] li',
'[id*="instruction"] li',
'[class*="direction"] li',
'[id*="direction"] li',
'[class*="step"] li',
'[id*="step"] li',
'ol li' // fallback - numbered lists are often instructions
];
for (const selector of instructionSelectors) {
try {
const instructionElements = recipeContainer.querySelectorAll(selector);
if (instructionElements.length > 0) {
instructions = Array.from(instructionElements)
.map(el => el.textContent.trim())
.filter(text => text.length > 10); // Instructions are usually longer
if (instructions.length > 0) {
break;
}
}
} catch (e) {
// Ignore errors with selectors
}
}
// If no list items found for instructions, look for paragraphs in instruction container
if (instructions.length === 0) {
const instructionContainerSelectors = [
'[itemprop="recipeInstructions"]',
'.instructions',
'.recipe-instructions',
'.steps',
'.method',
'.directions',
'[class*="instruction"]',
'[id*="instruction"]',
'[class*="direction"]',
'[id*="direction"]',
'[class*="step"]',
'[id*="step"]'
];
for (const selector of instructionContainerSelectors) {
try {
const container = recipeContainer.querySelector(selector);
if (container) {
const paragraphs = container.querySelectorAll('p');
if (paragraphs.length > 0) {
instructions = Array.from(paragraphs)
.map(el => el.textContent.trim())
.filter(text => text.length > 10);
if (instructions.length > 0) {
break;
}
}
}
} catch (e) {
// Ignore errors with selectors
}
}
}
// If still no instructions, try looking for paragraphs after an "Instructions" heading
if (instructions.length === 0) {
const headings = Array.from(recipeContainer.querySelectorAll('h1, h2, h3, h4, h5, h6'));
for (const heading of headings) {
if (heading.textContent.trim().toLowerCase().match(/instruction|direction|method|step|preparation/)) {
// Get all paragraphs and list items that follow this heading
let currentNode = heading.nextElementSibling;
while (currentNode && !currentNode.tagName.match(/^H[1-6]$/)) {
if (currentNode.tagName === 'OL' || currentNode.tagName === 'UL') {
const items = Array.from(currentNode.querySelectorAll('li'))
.map(li => li.textContent.trim())
.filter(text => text.length > 10);
if (items.length > 0) {
instructions = items;
break;
}
} else if (currentNode.tagName === 'P') {
const text = currentNode.textContent.trim();
if (text.length > 10) {
instructions.push(text);
}
}
currentNode = currentNode.nextElementSibling;
}
if (instructions.length > 0) break;
}
}
}
// Build content from extracted elements
// If we have ingredients, add them
if (ingredients.length > 0) {
// content.push('Ingredients'); // Removing label
ingredients.forEach(ingredient => {
content.push(ingredient);
});
}
// If we have instructions, add them
if (instructions.length > 0) {
// content.push('Instructions'); // Removing label
instructions.forEach((instruction, index) => {
// Add numbering if not already numbered
if (!instruction.match(/^\d+[\.\)]/)) {
content.push(`${index + 1}. ${instruction}`);
} else {
content.push(instruction);
}
});
}
// If we still don't have content, extract paragraphs from the recipe container
if (content.length === 0) {
const paragraphs = recipeContainer.querySelectorAll('p');
if (paragraphs.length > 0) {
Array.from(paragraphs)
.map(p => p.textContent.trim())
.filter(text => text.length > 20)
.forEach(text => content.push(text));
}
}
return content;
});
return recipeData;
} catch (error) {
return [];
}
}
/**
* Extracts content from product pages
*/
async extractFromProduct() {
try {
const productContent = await this.page.evaluate(() => {
const results = [];
// REI-specific extraction
const isREIProduct = window.location.href.includes('rei.com');
if (isREIProduct) {
try {
// Try REI-specific selectors first
const productTitle = document.querySelector('[data-ui="product-title"]')?.textContent.trim() ||
document.querySelector('h1')?.textContent.trim();
if (productTitle) {
results.push(`Product Title: ${productTitle}`);
}
const brandName = document.querySelector('[data-ui="product-brand"]')?.textContent.trim();
if (brandName) {
results.push(`Brand: ${brandName}`);
}
const productPrice = document.querySelector('[data-ui="sale-price"]')?.textContent.trim() ||
document.querySelector('[data-ui="display-price"]')?.textContent.trim();
if (productPrice) {
results.push(`Price: ${productPrice}`);
}
// Extract product description
const descriptionElem = document.querySelector('.product-information-container');
if (descriptionElem) {
const descText = descriptionElem.textContent.trim();
if (descText.length > 0) {
results.push(`Description: ${descText}`);
}
}
// Extract specifications
const specSections = document.querySelectorAll('.pdp-accordion-content');
for (const section of specSections) {
const sectionTitle = section.previousElementSibling?.textContent.trim();
if (sectionTitle) {
results.push(`Section: ${sectionTitle}`);
}
const specItems = section.querySelectorAll('li, p');
for (const item of specItems) {
const itemText = item.textContent.trim();
if (itemText.length > 0) {
results.push(`- ${itemText}`);
}
}
}
// If we found good content, return it
if (results.length > 0) {
return results;
}
} catch (e) {
// If REI-specific extraction fails, continue with generic extraction
console.error('REI-specific extraction failed:', e);
}
}
// Generic extraction for other sites
// Extract product title
const titleSelectors = [
'[itemprop="name"]',
'.product-title',
'.product-name',
'.product__title',
'h1.title',
'[data-testid="product-title"]',
'.pdp-title',
'h1',
'#productTitle'
];
for (const selector of titleSelectors) {
try {
const titleElement = document.querySelector(selector);
if (titleElement && titleElement.textContent.trim().length > 0) {
results.push(`Product Title: ${titleElement.textContent.trim()}`);
break;
}
} catch (e) {
continue;
}
}
// Extract price
const priceSelectors = [
'[itemprop="price"]',
'.price',
'.product-price',
'.product__price',
'[data-testid="price"]',
'.pdp-price',
'.current-price',
'#priceblock_ourprice',
'.price-characteristic'
];
for (const selector of priceSelectors) {
try {
const priceElement = document.querySelector(selector);
if (priceElement && priceElement.textContent.trim().length > 0) {
results.push(`Price: ${priceElement.textContent.trim()}`);
break;
}
} catch (e) {
continue;
}
}
// Extract description
const descriptionSelectors = [
'[itemprop="description"]',
'.product-description',
'.description',
'.product__description',
'#description',
'.pdp-description',
'[data-testid="product-description"]',
'#productDescription',
'[data-component-type="s-product-description"]'
];
for (const selector of descriptionSelectors) {
try {
const descElements = document.querySelectorAll(selector);
if (descElements.length > 0) {
for (const elem of descElements) {
const text = elem.textContent.trim();
if (text.length > 0) {
results.push(`Description: ${text}`);
}
}
}
} catch (e) {
continue;
}
}
// Extract features/specs
const featureSelectors = [
'.product-features',
'.features',
'.specifications',
'.specs',
'.product-specs',
'.tech-specs',
'[data-testid="product-specs"]',
'#feature-bullets',
'.product-attributes',
'.accordion-inner'
];
for (const selector of featureSelectors) {
try {
const featureElements = document.querySelectorAll(`${selector} li, ${selector} p, ${selector} div`);
if (featureElements.length > 0) {
for (const elem of featureElements) {
const text = elem.textContent.trim();
if (text.length > 10 && text.length < 500) { // Reasonable feature length
results.push(`Feature: ${text}`);
}
}
}
} catch (e) {
continue;
}
}
// If we have very little content, try more aggressive extraction
if (results.length < 3) {
// Look for product details in any container
const possibleContainers = document.querySelectorAll('.product-details, .product-info, [class*="product-"], [class*="pdp-"], [id*="product-"]');
for (const container of possibleContainers) {
// Skip tiny containers or hidden elements
if (!container.offsetParent || container.textContent.trim().length < 50) continue;
// Get structured content like paragraphs, lists, etc.
const textElements = container.querySelectorAll('p, li, h3, h4, h5, h6');
for (const elem of textElements) {
const text = elem.textContent.trim();
if (text.length > 10 && !results.includes(text)) {
results.push(text);
}
}
// If no structured elements, get the container text
if (textElements.length === 0 && container.textContent.trim().length > 50) {
results.push(container.textContent.trim().substring(0, 500) +
(container.textContent.trim().length > 500 ? '...' : ''));
}
}
}
// Last resort: grab heading and all nearby paragraphs
if (results.length < 2) {
const h1 = document.querySelector('h1');
if (h1) {
if (!results.some(r => r.includes(h1.textContent.trim()))) {
results.push(`Product Title: ${h1.textContent.trim()}`);
}
let sibling = h1.nextElementSibling;
while (sibling && results.length < 10) {
if (sibling.tagName === 'P' || sibling.tagName === 'DIV') {
const text = sibling.textContent.trim();
if (text.length > 20 && !results.includes(text)) {
results.push(text);
}
}
sibling = sibling.nextElementSibling;
}
}
}
return results;
});
return productContent;
} catch (error) {
return [];
}
}
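/**
* Extracts content from documentation pages.
*
* Runs inside the page context. Pages whose URL contains 'mozilla.org' or
* 'mdn.' are tried with MDN-specific selectors first; if that yields no
* content (or throws), extraction continues with the generic
* documentation selectors.
*/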
async extractFromDocumentation() {
try {
const docContent = await this.page.evaluate(() => {
const results = [];
// MDN-specific extraction
const isMDN = window.location.href.includes('mozilla.org') || window.location.href.includes('mdn.');
if (isMDN) {
try {
// Try MDN-specific selectors first
const mainContent = document.querySelector('.main-page-content, .article, #content-main, .article__content');
if (mainContent) {
// Extract title and headings
const title = document.querySelector('h1')?.textContent.trim();
if (title) {
results.push(`Title: ${title}`);
}
// Extract all section headings and their content
const headings = mainContent.querySelectorAll('h2, h3, h4, h5, h6');
for (const heading of headings) {
const headingText = heading.textContent.trim();
if (headingText) {
results.push(`\n${headingText}`);
// Get all content until next heading
let nextElem = heading.nextElementSibling;
while (nextElem && !['H2', 'H3', 'H4', 'H5', 'H6'].includes(nextElem.tagName)) {
// Only add paragraphs, lists, and code blocks
if (['P', 'UL', 'OL', 'PRE', 'CODE', 'DL', 'TABLE'].includes(nextElem.tagName)) {
const text = nextElem.textContent.trim();
if (text.length > 0) {
results.push(text);
}
}
nextElem = nextElem.nextElementSibling;
}
}
}
// If we found good content, return it
if (results.length > 0) {
return results;
}
}
} catch (e) {
// If MDN-specific extraction fails, continue with generic extraction
console.error('MDN-specific extraction failed:', e);
}
}
// Try documentation-specific selectors
const docSelectors = [
'.documentation',
'.docs',
'.doc-content',
'.article__content',
'.article-content',
'.documentation__main',
'.documentation__content',
'.markdown-body',
'.markdown-section',
'