UNPKG

@plust/datasleuth

Version:

Build LLM-powered research pipelines and output structured data.

82 lines (81 loc) 2.74 kB
/**
 * Content extraction step for the research pipeline.
 *
 * Declares the option and result shapes for the step that extracts content
 * from URLs found in search results, plus the factory that creates the step.
 */
import { createStep } from '../utils/steps.js';
import { StepOptions } from '../types/pipeline.js';

/**
 * Options accepted by the content extraction step.
 *
 * Every field is optional; this interface also carries whatever
 * {@link StepOptions} declares.
 */
export interface ExtractContentOptions extends StepOptions {
  /** CSS selectors to extract content from. */
  selectors?: string;

  /** Alias for `selectors`, kept for backwards compatibility. */
  selector?: string;

  /** Upper bound on the number of URLs to process. */
  maxUrls?: number;

  /** Upper bound on the content length per URL, in characters. */
  maxContentLength?: number;

  /** Whether the extracted content is included in the final results. */
  includeInResults?: boolean;

  /** Timeout for each URL fetch, in milliseconds. */
  timeout?: number;

  /** Retry configuration for fetches. */
  retry?: {
    /** Maximum number of retries. */
    maxRetries?: number;

    /** Base delay between retries, in milliseconds. */
    baseDelay?: number;
  };

  /** Minimum content length for an extraction to be considered successful. */
  minContentLength?: number;

  /** Whether to continue when some URLs fail to extract. */
  continueOnError?: boolean;

  /** Whether at least one successful extraction is required. */
  requireSuccessful?: boolean;
}

/**
 * Metadata recorded alongside a piece of extracted content.
 */
export interface ExtractedContentMetadata {
  /** Approximate number of words in the content. */
  wordCount: number;

  /** Domain of the source website. */
  domain: string;

  /** HTTP status code of the response. */
  statusCode: number;

  /** MIME type of the content. */
  contentType?: string;

  /** Timestamp of the extraction. */
  extractedAt: string;

  /** Selectors that matched and were used. */
  matchedSelectors?: string[];

  /** Whether the extraction was complete rather than partial. */
  isComplete?: boolean;

  /** Time the extraction took, in milliseconds. */
  extractionTimeMs?: number;

  /** Number of retry attempts that were made. */
  retryAttempts?: number;
}

/**
 * A single piece of content extracted from a URL.
 */
export interface ExtractedContent {
  /** URL the content was extracted from. */
  url: string;

  /** Title of the content. */
  title: string;

  /** The extracted text content. */
  content: string;

  /** Additional metadata about the extraction. */
  metadata?: ExtractedContentMetadata;

  /** Date of the extraction. */
  extractionDate?: string;
}

/**
 * Creates a content extraction step for the research pipeline.
 *
 * @param options - Configuration options for content extraction.
 * @returns A content extraction step for the research pipeline, typed as
 *   whatever `createStep` returns.
 */
export declare function extractContent(
  options?: ExtractContentOptions,
): ReturnType<typeof createStep>;