@plust/datasleuth
Version:
Build LLM-powered research pipelines and output structured data.
82 lines (81 loc) • 2.74 kB
TypeScript
/**
* Content extraction step for the research pipeline
* Extracts content from URLs found in search results
*/
import { createStep } from '../utils/steps.js';
import { StepOptions } from '../types/pipeline.js';
/**
* Options accepted by the content extraction step.
*
* Extends StepOptions, so every option declared there is accepted here too.
* All fields are optional; default values are not visible in this declaration
* and must be looked up in the implementation.
*/
export interface ExtractContentOptions extends StepOptions {
/**
* CSS selectors to extract content from.
* NOTE(review): typed as a single string — presumably a comma-separated
* selector list; confirm against the implementation.
*/
selectors?: string;
/**
* Alias for selectors (for backwards compatibility).
* NOTE(review): precedence when both `selectors` and `selector` are set is
* not visible here — confirm.
*/
selector?: string;
/** Maximum number of URLs to process */
maxUrls?: number;
/** Maximum content length per URL (characters) */
maxContentLength?: number;
/** Whether to include the extracted content in the final results */
includeInResults?: boolean;
/** Timeout for each URL fetch in milliseconds */
timeout?: number;
/** Fetch retry configuration; both nested fields are optional */
retry?: {
/** Maximum number of retries */
maxRetries?: number;
/**
* Base delay between retries in ms.
* NOTE(review): "base" suggests a backoff schedule built on this value —
* confirm how the delay grows between attempts.
*/
baseDelay?: number;
};
/**
* Minimum content length to consider a successful extraction.
* NOTE(review): presumably measured in characters like maxContentLength — confirm.
*/
minContentLength?: number;
/** Whether to continue if some URLs fail to extract */
continueOnError?: boolean;
/**
* Whether to require at least one successful extraction.
* NOTE(review): interaction with continueOnError (e.g. all URLs failing) is
* not visible here — confirm.
*/
requireSuccessful?: boolean;
}
/**
* Metadata describing a single extraction.
*
* wordCount, domain, statusCode and extractedAt are required; the remaining
* fields are optional.
*/
export interface ExtractedContentMetadata {
/** Approximate word count in the content */
wordCount: number;
/** Domain of the source website */
domain: string;
/** HTTP status code of the response */
statusCode: number;
/** MIME type of the content */
contentType?: string;
/**
* Extraction timestamp, stored as a string.
* NOTE(review): presumably ISO 8601 — confirm the exact format.
*/
extractedAt: string;
/** Which selectors matched and were used */
matchedSelectors?: string[];
/** Whether this was a complete extraction (true) or a partial one (false) */
isComplete?: boolean;
/** Extraction time in milliseconds */
extractionTimeMs?: number;
/** Number of retry attempts made */
retryAttempts?: number;
}
/**
* Content extracted from a single URL.
*
* url, title and content are required; metadata and extractionDate are optional.
*/
export interface ExtractedContent {
/** URL of the extracted content */
url: string;
/** Title of the content */
title: string;
/** The extracted text content */
content: string;
/** Additional metadata about the extraction */
metadata?: ExtractedContentMetadata;
/**
* Extraction date, stored as a string.
* NOTE(review): appears to overlap with metadata.extractedAt — confirm which
* one consumers should rely on, and the string format.
*/
extractionDate?: string;
}
/**
* Creates a content extraction step for the research pipeline.
*
* This is an ambient declaration only; the implementation lives elsewhere.
* The return type is whatever createStep returns, so the result has the same
* shape as any other step built with that helper.
*
* @param options Configuration options for content extraction; may be omitted
* @returns A content extraction step for the research pipeline
*/
export declare function extractContent(options?: ExtractContentOptions): ReturnType<typeof createStep>;