UNPKG

@plust/datasleuth

Version:

Build LLM-powered research pipelines and output structured data.

github.com/PlustOrg/datasleuth

PlustOrg/datasleuth

466 lines • 22.9 kB

JavaScript

/**
 * Content extraction step for the research pipeline
 * Extracts content from URLs found in search results
 */
import { createStep } from '../utils/steps.js';
import axios, { AxiosError } from 'axios';
import * as cheerio from 'cheerio';
import { ExtractionError, NetworkError, ValidationError } from '../types/errors.js';
import { createStepLogger } from '../utils/logging.js';

/** Step name used for the logger, for `createStep`, and in every error's `step` field. */
const STEP_NAME = 'ContentExtraction';

/** CSS selectors tried when the caller supplies neither `selectors` nor `selector`. */
const DEFAULT_SELECTORS = 'article, .content, main, #content, .article, .post';

/** Retry defaults shared by the per-URL fetch loop and the step-level configuration. */
const DEFAULT_MAX_RETRIES = 2;
const DEFAULT_BASE_DELAY_MS = 500;

/** Elements stripped from the document before falling back to the whole `<body>` text. */
const BOILERPLATE_SELECTOR =
  'script, style, nav, header, footer, aside, [role=banner], [role=navigation], .sidebar';

/** Browser-like request headers sent with every page fetch. */
const BROWSER_LIKE_HEADERS = Object.freeze({
  'User-Agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
  Accept:
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  'Accept-Language': 'en-US,en;q=0.9',
  'Accept-Encoding': 'gzip, deflate, br',
  'Cache-Control': 'max-age=0',
  'Sec-Ch-Ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
  'Sec-Ch-Ua-Mobile': '?0',
  'Sec-Ch-Ua-Platform': '"macOS"',
  'Sec-Fetch-Dest': 'document',
  'Sec-Fetch-Mode': 'navigate',
  'Sec-Fetch-Site': 'none',
  'Sec-Fetch-User': '?1',
  'Upgrade-Insecure-Requests': '1',
  Referer: 'https://www.google.com/',
  Connection: 'keep-alive',
});

/**
 * Returns a printable message for any thrown value, including non-Error throwables.
 *
 * @param {unknown} error - The caught value
 * @returns {string} `error.message` for Error instances, otherwise `String(error)`
 */
function describeError(error) {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Decides whether a failed fetch/extract attempt is worth repeating.
 *
 * @param {unknown} error - The error caught during an attempt
 * @returns {boolean} `false` for failures that would recur unchanged, `true` otherwise
 */
function isRetryableFailure(error) {
  // Our own ExtractionError (e.g. unsupported content type) describes the
  // response itself; fetching the same URL again cannot change the outcome.
  if (error instanceof ExtractionError) {
    return false;
  }
  if (error instanceof AxiosError && error.response) {
    const { status } = error.response;
    // Server errors, request timeouts (408) and rate limiting (429) can be
    // transient; every other client error is treated as permanent.
    return status >= 500 || status === 408 || status === 429;
  }
  // No response (connection/timeout problems) or an unknown error: retry.
  return true;
}

/**
 * Converts the error of the final attempt into the error that is thrown to the caller.
 *
 * @param {unknown} error - The error caught during the final attempt
 * @param {string} url - The URL that was being fetched
 * @param {number} retriesUsed - Number of retries performed before giving up
 * @returns {Error} A NetworkError or ExtractionError describing the failure
 */
function toFetchFailure(error, url, retriesUsed) {
  // Already carries a precise message and suggestions; do not re-wrap it.
  if (error instanceof ExtractionError) {
    return error;
  }

  const reason = describeError(error);
  const attempts = retriesUsed + 1;

  if (error instanceof AxiosError) {
    const isNetworkFailure =
      !error.response || error.code === 'ECONNABORTED' || error.message.includes('timeout');
    if (isNetworkFailure) {
      return new NetworkError({
        message: `Network error fetching ${url} after ${retriesUsed} retries: ${reason}`,
        step: STEP_NAME,
        details: { url, error, attempts },
        retry: true,
        suggestions: [
          'Check your internet connection',
          'The website may be temporarily unavailable',
          'Try increasing the timeout value',
          'Consider using a different URL',
        ],
      });
    }

    const { status } = error.response;
    if (status >= 400) {
      let suggestion = 'Check if the URL is correct and accessible';
      if (status === 403) {
        suggestion = 'The website is blocking access, consider using a different source';
      } else if (status === 404) {
        suggestion = 'The URL does not exist or has been moved';
      } else if (status >= 500) {
        suggestion = 'The website server is experiencing issues, try again later';
      }
      return new ExtractionError({
        message: `HTTP error (${status}) fetching ${url} after ${retriesUsed} retries`,
        step: STEP_NAME,
        details: { url, status, error },
        // Server errors can be retried, client errors usually can't
        retry: status >= 500,
        suggestions: [suggestion],
      });
    }
  }

  // Anything else (parsing failures, invalid selectors, ...)
  return new ExtractionError({
    message: `Failed to extract content from ${url} after ${retriesUsed} retries: ${reason}`,
    step: STEP_NAME,
    details: { url, error, attempts },
    retry: false,
    suggestions: [
      'Try different CSS selectors',
      'The website might be using JavaScript to render content',
      'Consider using a more robust extraction method',
    ],
  });
}

/**
 * Builds the error thrown when a URL fails and `continueOnError` is disabled.
 *
 * @param {unknown} error - The error caught while processing the URL
 * @param {string} url - The URL that failed
 * @returns {Error} The original project error, or a NetworkError/ExtractionError wrapping it
 */
function toFatalUrlError(error, url) {
  // Project errors are already properly formatted. Re-wrapping them would
  // duplicate the message and drop their `retry`/`details` information.
  if (
    error instanceof NetworkError ||
    error instanceof ExtractionError ||
    error instanceof ValidationError
  ) {
    return error;
  }

  const looksLikeNetworkFailure =
    error instanceof Error &&
    (error.message.includes('ECONNREFUSED') ||
      error.message.includes('ETIMEDOUT') ||
      error.message.includes('network'));

  if (looksLikeNetworkFailure) {
    return new NetworkError({
      message: `Network error extracting content from ${url}: ${error.message}`,
      step: STEP_NAME,
      details: { url, originalError: error },
      retry: true,
      suggestions: [
        'Check your internet connection',
        'Verify the URL is accessible',
        'Try increasing the timeout value',
        'The website might be blocking requests, consider using a different approach',
      ],
    });
  }

  // Generic extraction error
  return new ExtractionError({
    message: `Failed to extract content from ${url}: ${describeError(error)}`,
    step: STEP_NAME,
    details: { url, originalError: error },
    retry: false,
    suggestions: [
      'Check if the website structure supports content extraction',
      'Try different CSS selectors',
      "The website might be using JavaScript to render content, which simple extraction can't handle",
    ],
  });
}

/**
 * Collects the text of a loaded document using the given selectors.
 *
 * @param {Function} $ - The cheerio instance returned by `cheerio.load`
 * @param {string|string[]} selectors - Comma-separated selector string, or an array of selectors
 * @returns {{ content: string, matchedSelectors: string[] }} Whitespace-normalised
 *   text and the selectors (or `'body'`) that produced it
 */
function collectText($, selectors) {
  const selectorList = (Array.isArray(selectors) ? selectors : String(selectors).split(','))
    .map((s) => String(s).trim())
    .filter(Boolean);
  const matchedSelectors = [];
  let content = '';

  // Text is gathered from EVERY selector that matches, not only the first.
  // NOTE(review): when matches are nested (e.g. <main><article>…), the inner
  // text is therefore collected more than once.
  for (const selector of selectorList) {
    const elements = $(selector);
    if (elements.length === 0) {
      continue;
    }
    matchedSelectors.push(selector);
    elements.each((_, element) => {
      const node = $(element);
      // Drop script and style elements so their source does not end up in the text
      node.find('script, style').remove();
      const elementText = node.text().trim();
      if (elementText) {
        content += `${elementText}\n\n`;
      }
    });
  }

  // If no content was found with specific selectors, fall back to the body
  if (!content.trim()) {
    $(BOILERPLATE_SELECTOR).remove();
    content = $('body').text().trim();
    matchedSelectors.push('body');
  }

  // Collapse every whitespace run (spaces, tabs AND newlines) into one space.
  // The paragraph separators added above are flattened by this as well.
  content = content.replace(/\s+/g, ' ').trim();

  return { content, matchedSelectors };
}

/**
 * Executes content extraction from URLs in search results
 *
 * Reads `state.data.searchResults`, fetches up to `maxUrls` of them one after
 * another, and returns a new state object whose `data` additionally contains
 * `extractedContent` and `extractionMetadata`. The input state is not mutated.
 *
 * @param {object} state - Pipeline state; `state.data.searchResults` is read
 * @param {object} options - Step options (`selectors`/`selector`, `maxUrls`,
 *   `maxContentLength`, `minContentLength`, `includeInResults`, `timeout`,
 *   `retry`, `continueOnError`, `requireSuccessful`)
 * @returns {Promise<object>} The unchanged state when there are no search
 *   results, otherwise a new state with the extraction output attached
 * @throws {ValidationError} No search results and `requireSuccessful` is set,
 *   or a URL is invalid while `continueOnError` is disabled
 * @throws {NetworkError|ExtractionError} A URL failed while `continueOnError`
 *   is disabled, or nothing could be extracted and `requireSuccessful` is set
 */
async function executeExtractContentStep(state, options) {
  const stepLogger = createStepLogger(STEP_NAME);
  const {
    selectors: explicitSelectors,
    selector,
    maxUrls = 5,
    maxContentLength = 10000,
    minContentLength = 100,
    includeInResults = false,
    timeout = 10000,
    retry = { maxRetries: DEFAULT_MAX_RETRIES, baseDelay: DEFAULT_BASE_DELAY_MS },
    continueOnError = true,
    requireSuccessful = false,
  } = options;

  // Use selectors if provided, otherwise use selector (alias), or fall back to default
  const selectors = explicitSelectors || selector || DEFAULT_SELECTORS;
  // `??` keeps an explicit 0 (e.g. "no retries"); `?.` tolerates `retry: null`
  const retryPolicy = {
    maxRetries: retry?.maxRetries ?? DEFAULT_MAX_RETRIES,
    baseDelay: retry?.baseDelay ?? DEFAULT_BASE_DELAY_MS,
  };

  stepLogger.info('Starting content extraction execution');
  stepLogger.debug(`Using selectors: ${selectors}`);

  try {
    // Get search results from state
    const searchResults = state.data.searchResults || [];
    if (searchResults.length === 0) {
      stepLogger.warn('No search results found for content extraction');
      if (requireSuccessful) {
        throw new ValidationError({
          message: 'No search results available for content extraction',
          step: STEP_NAME,
          suggestions: [
            'Ensure the search step runs successfully before content extraction',
            'Check if search step is returning results',
            'Consider making this step optional if search results are not guaranteed',
          ],
        });
      }
      return state;
    }

    // Extract content from each URL (up to maxUrls)
    const urlsToProcess = searchResults.slice(0, maxUrls);
    const extractedContents = [];
    const failedUrls = [];
    stepLogger.info(`Processing ${urlsToProcess.length} URLs for content extraction`);

    // URLs are processed sequentially, one request at a time
    for (const result of urlsToProcess) {
      try {
        stepLogger.debug(`Extracting content from: ${result.url}`);
        const startTime = Date.now();
        const extractedContent = await extractContentFromURL(
          result.url,
          result.title || '',
          selectors,
          maxContentLength,
          timeout,
          retryPolicy,
          stepLogger,
        );
        const extractionTime = Date.now() - startTime;
        const extractedLength = extractedContent.content.length;

        // Ensure content meets minimum length requirement
        if (extractedLength < minContentLength) {
          stepLogger.warn(
            `Extracted content from ${result.url} is too short (${extractedLength} chars), skipping`,
          );
          failedUrls.push({
            url: result.url,
            reason: `Content too short (${extractedLength} chars)`,
          });
          continue;
        }

        // Add extraction time to metadata
        if (extractedContent.metadata) {
          extractedContent.metadata.extractionTimeMs = extractionTime;
        }
        stepLogger.info(
          `Successfully extracted ${extractedLength} chars from ${result.url} in ${extractionTime}ms`,
        );
        extractedContents.push(extractedContent);
      } catch (error) {
        const errorMessage = describeError(error);
        stepLogger.error(`Failed to extract content from ${result.url}: ${errorMessage}`);
        failedUrls.push({ url: result.url, reason: errorMessage });

        // If we should not continue on error, abort the whole step
        if (!continueOnError) {
          throw toFatalUrlError(error, result.url);
        }
      }
    }

    // Check if we have extracted any content
    if (extractedContents.length === 0 && requireSuccessful) {
      throw new ExtractionError({
        message: 'Failed to extract content from any of the provided URLs',
        step: STEP_NAME,
        details: { failedUrls },
        retry: false,
        suggestions: [
          'Check if the websites are accessible',
          'Try different CSS selectors',
          'The websites might be using JavaScript to render content',
          'Consider using a more robust extraction method',
        ],
      });
    }

    // Calculate statistics. Guard the division: `maxUrls: 0` yields an empty
    // batch, and 0 / 0 would put NaN into the metadata and the log line.
    const successRate =
      urlsToProcess.length > 0 ? extractedContents.length / urlsToProcess.length : 0;
    const totalContentLength = extractedContents.reduce(
      (sum, item) => sum + item.content.length,
      0,
    );
    const avgContentLength =
      extractedContents.length > 0 ? totalContentLength / extractedContents.length : 0;

    stepLogger.info(
      `Extraction complete: ${extractedContents.length}/${urlsToProcess.length} URLs successful (${(successRate * 100).toFixed(1)}%)`,
    );
    stepLogger.debug(`Average content length: ${avgContentLength.toFixed(0)} characters`);

    // Update state with extracted content and metadata
    const newState = {
      ...state,
      data: {
        ...state.data,
        extractedContent: extractedContents,
        extractionMetadata: {
          totalProcessed: urlsToProcess.length,
          successful: extractedContents.length,
          failed: failedUrls.length,
          failedUrls,
          successRate,
          totalContentLength,
          avgContentLength,
          timestamp: new Date().toISOString(),
        },
      },
    };

    if (!includeInResults) {
      return newState;
    }

    // Add to results if requested; tolerate a state that has no `results` yet
    return {
      ...newState,
      results: [
        ...(newState.results ?? []),
        {
          extractedContent: extractedContents,
          extractionStats: {
            successRate,
            totalContentLength,
            avgContentLength,
            successful: extractedContents.length,
            failed: failedUrls.length,
          },
        },
      ],
    };
  } catch (error) {
    // These are already properly formatted, just throw them
    if (
      error instanceof NetworkError ||
      error instanceof ExtractionError ||
      error instanceof ValidationError
    ) {
      throw error;
    }

    // Format Axios errors specifically
    if (error instanceof AxiosError) {
      const status = error.response?.status;
      const url = error.config?.url;
      const isNetworkError =
        !error.response || error.code === 'ECONNABORTED' || error.message.includes('timeout');

      if (isNetworkError) {
        throw new NetworkError({
          message: `Network error during content extraction: ${error.message}`,
          step: STEP_NAME,
          details: { error, url },
          retry: true,
          suggestions: [
            'Check your internet connection',
            'Verify the URLs are accessible',
            'Try increasing the timeout value',
          ],
        });
      }

      if (status && status >= 400 && status < 500) {
        let suggestion = 'Check if the URL is correct and accessible';
        if (status === 403) {
          suggestion = 'The website is blocking access, consider using a different approach';
        } else if (status === 404) {
          suggestion = 'The URL does not exist or has been moved';
        }
        throw new ExtractionError({
          message: `Client error (${status}) during content extraction: ${error.message}`,
          step: STEP_NAME,
          details: { error, status, url },
          retry: false,
          suggestions: [suggestion],
        });
      }

      if (status && status >= 500) {
        throw new ExtractionError({
          message: `Server error (${status}) during content extraction: ${error.message}`,
          step: STEP_NAME,
          details: { error, status, url },
          retry: true,
          suggestions: [
            'The website server is experiencing issues',
            'Try again later',
            'Consider using a different source for information',
          ],
        });
      }
    }

    // Generic error handling
    throw new ExtractionError({
      message: `Error during content extraction: ${describeError(error)}`,
      step: STEP_NAME,
      details: { originalError: error },
      retry: false,
      suggestions: [
        'Check configuration parameters',
        'Verify URL formats',
        'Inspect the error details for more specific guidance',
      ],
    });
  }
}

/**
 * Extracts content from a URL using the provided selectors
 *
 * Performs up to `retry.maxRetries + 1` attempts with exponential backoff
 * (`retry.baseDelay * 2^(n-1)` ms before retry n). Failures that cannot
 * succeed on a second try (unsupported content type, most 4xx responses)
 * abort immediately instead of consuming the remaining retries.
 *
 * @param {string} url - Absolute URL to fetch
 * @param {string} title - Known title; when blank, the page's `<title>`, its first `<h1>`, or the URL is used
 * @param {string|string[]} selectors - Comma-separated CSS selectors, or an array of selectors
 * @param {number} maxLength - Maximum content length before truncation (`'...'` is appended after the cut)
 * @param {number} timeout - Request timeout passed to axios
 * @param {{ maxRetries: number, baseDelay: number }} retry - Retry policy
 * @param {object} stepLogger - Logger providing `debug` and `warn`
 * @returns {Promise<object>} `{ url, title, content, metadata, extractionDate }`
 * @throws {ValidationError} The URL is not a valid absolute URL
 * @throws {NetworkError} The request failed without a response or timed out
 * @throws {ExtractionError} HTTP error status, unsupported content type, or any other failure
 */
async function extractContentFromURL(url, title, selectors, maxLength, timeout, retry, stepLogger) {
  // Validate URL; the parsed form is reused later for the domain
  let parsedUrl;
  try {
    parsedUrl = new URL(url); // Will throw if invalid
  } catch (error) {
    throw new ValidationError({
      message: `Invalid URL format: ${url}`,
      step: STEP_NAME,
      details: { url, error },
      suggestions: [
        'Check URL format, must be a valid absolute URL',
        'Ensure URL includes protocol (http:// or https://)',
      ],
    });
  }

  let lastError = null;

  // `attempt` is the number of retries already performed (0 = first try)
  for (let attempt = 0; attempt <= retry.maxRetries; attempt++) {
    try {
      // If not the first attempt, wait with exponential backoff
      if (attempt > 0) {
        const delayTime = retry.baseDelay * 2 ** (attempt - 1);
        stepLogger.debug(
          `Retrying ${url} (attempt ${attempt} of ${retry.maxRetries}, delay: ${delayTime}ms)...`,
        );
        await new Promise((resolve) => setTimeout(resolve, delayTime));
      }

      // Fetch the content
      const response = await axios.get(url, {
        timeout,
        headers: { ...BROWSER_LIKE_HEADERS },
        maxRedirects: 5,
        validateStatus: (status) => status < 400, // Only allow status codes less than 400
      });

      // Check content type
      const contentType = response.headers['content-type'] || '';
      if (!contentType.includes('html') && !contentType.includes('text')) {
        throw new ExtractionError({
          message: `Unsupported content type: ${contentType}`,
          step: STEP_NAME,
          details: { url, contentType },
          suggestions: [
            'This URL points to non-HTML content that cannot be extracted',
            'Try a different URL that contains HTML content',
          ],
        });
      }

      // Load the HTML into cheerio
      const $ = cheerio.load(response.data);

      // Extract title if not provided or empty
      const resolvedTitle = title.trim()
        ? title
        : $('title').text().trim() || $('h1').first().text().trim() || url;

      const { content, matchedSelectors } = collectText($, selectors);

      // Truncate if necessary
      const isComplete = content.length <= maxLength;
      const finalContent = isComplete ? content : `${content.substring(0, maxLength)}...`;

      const extractedAt = new Date().toISOString();

      return {
        url,
        title: resolvedTitle,
        content: finalContent,
        metadata: {
          // Approximate word count of the (possibly truncated) content
          wordCount: finalContent.split(/\s+/).filter(Boolean).length,
          domain: parsedUrl.hostname,
          statusCode: response.status,
          contentType: response.headers['content-type'],
          extractedAt,
          matchedSelectors,
          isComplete,
          retryAttempts: attempt,
        },
        // Add the extractionDate field to match the pipeline.ts interface
        extractionDate: extractedAt,
      };
    } catch (error) {
      lastError = error;

      if (attempt < retry.maxRetries && isRetryableFailure(error)) {
        stepLogger.warn(
          `Extraction attempt ${attempt + 1} failed for ${url}: ${describeError(error)}`,
        );
        continue;
      }

      // Retries exhausted, or the failure is permanent: format and throw
      throw toFetchFailure(error, url, attempt);
    }
  }

  // Only reachable when the loop body never ran (e.g. a negative maxRetries)
  throw lastError || new Error(`Failed to extract content from ${url}`);
}

/**
 * Creates a content extraction step for the research pipeline
 *
 * @param options Configuration options for content extraction
 * @returns A content extraction step for the research pipeline
 */
export function extractContent(options = {}) {
  return createStep(
    STEP_NAME,
    // Wrapper matching the (state, options) signature; the options captured
    // at creation time are the ones actually used.
    async (state, _stepOptions) => executeExtractContentStep(state, options),
    options,
    {
      // Mark as retryable by default for the entire step
      retryable: true,
      // `??` rather than `||` so an explicit 0 is honoured
      maxRetries: options.retry?.maxRetries ?? DEFAULT_MAX_RETRIES,
      retryDelay: options.retry?.baseDelay ?? DEFAULT_BASE_DELAY_MS,
      backoffFactor: 2,
      // Mark as optional unless explicitly required
      optional: !options.requireSuccessful,
    },
  );
}
//# sourceMappingURL=extractContent.js.map