UNPKG

weavebot-core

Version:

Generic content processing framework for web scraping and AI extraction

186 lines (178 loc) 5.19 kB
import { f as ProcessorPlugin, W as WebScrapingParams, c as ProcessingContext, k as ScrapedContent$1, A as AIExtractionParams } from './index-1GgaWFHn.js';
export { b as ProcessingStep } from './index-1GgaWFHn.js';
import { Page } from 'playwright';
import { z } from 'zod';

/**
 * Width/height pair shared by the viewport options declared below.
 * Local helper type only; it is not part of the export list.
 */
interface ViewportSize {
    width: number;
    height: number;
}

/**
 * Contract for a web-scraper plugin.
 *
 * Plugins let platform-specific handling live outside the core library
 * instead of being hardcoded into it.
 *
 * NOTE(review): this interface is referenced by exported APIs
 * (`WebScraperConfig.plugins`, `WebScraperProcessor.registerPlugin`) but is
 * not in this module's export list — confirm whether that is intentional.
 */
interface WebScraperPlugin {
    /** Unique name identifying the plugin. */
    name: string;

    /**
     * Decides whether this plugin should handle the given URL.
     *
     * @param url - URL that is about to be scraped.
     * @returns `true` when the plugin wants to handle `url`.
     */
    canHandle(url: string): boolean;

    /**
     * Supplies the scraping configuration to use for the given URL.
     *
     * @param url - URL that is about to be scraped.
     */
    getConfig(url: string): WebScrapingConfig;

    /**
     * Optional hook for page-specific interactions performed before content
     * extraction.
     *
     * @param page - Playwright page being scraped.
     * @param context - Per-scrape plugin context.
     */
    beforeExtract?(page: Page, context: PluginContext): Promise<void>;

    /**
     * Optional hook that extracts additional, platform-specific metadata.
     *
     * @param page - Playwright page being scraped.
     * @param context - Per-scrape plugin context.
     * @returns A promise of a string-keyed metadata record.
     */
    extractMetadata?(page: Page, context: PluginContext): Promise<Record<string, any>>;

    /**
     * Optional hook that post-processes the extracted content.
     *
     * @param content - Content produced by extraction.
     * @param context - Per-scrape plugin context.
     * @returns The post-processed content.
     */
    postProcess?(content: ScrapedContent, context: PluginContext): ScrapedContent;
}

/**
 * Scraping options returned by {@link WebScraperPlugin.getConfig}.
 */
interface WebScrapingConfig {
    /** Scraping strategy to use. */
    strategy: "static" | "spa" | "auto";

    /** Selectors to wait for before extraction. */
    waitSelectors?: string[];

    /** Maximum time to wait, in milliseconds. */
    timeout?: number;

    /** Custom headers to send. */
    headers?: Record<string, string>;

    /** Viewport configuration. */
    viewport?: ViewportSize;

    /** Whether to block resources (images, stylesheets, etc). */
    blockResources?: boolean;

    /**
     * Additional wait time after page load.
     * NOTE(review): unit is not stated here; presumably milliseconds like
     * `timeout` — confirm against the implementation.
     */
    additionalWait?: number;
}

/**
 * Context object handed to the optional plugin hooks.
 */
interface PluginContext {
    /** Original URL being scraped. */
    url: string;

    /** Logger instance (untyped in this declaration). */
    logger: any;

    /** Shared state between plugins. */
    state?: Record<string, any>;
}

/**
 * Shape of scraped content as seen by {@link WebScraperPlugin.postProcess}.
 *
 * This local interface is not in the export list; the `ScrapedContent` name
 * exported from this module is the imported `ScrapedContent$1`.
 */
interface ScrapedContent {
    url: string;
    title: string;
    text: string;
    html: string;
    metadata: Record<string, any>;
    extractedAt: Date;
}

/**
 * Options for the generic, plugin-aware web scraper.
 * Platform-specific logic is handled by plugins.
 */
interface WebScraperConfig {
    /** Default timeout for page operations. */
    defaultTimeout?: number;

    /** Whether to run in headless mode. */
    headless?: boolean;

    /** Default viewport size. */
    defaultViewport?: ViewportSize;

    /** Plugins to register. */
    plugins?: WebScraperPlugin[];
}

/**
 * Processor plugin that scrapes web content with configurable strategies.
 *
 * Only the declaration is visible here; private members are listed without
 * types and their behavior cannot be determined from this file.
 */
declare class WebScraperProcessor implements ProcessorPlugin {
    readonly name = "web-scraper";
    readonly description = "Scrapes web content with configurable strategies";

    private config;
    private pluginRegistry;
    private browser;

    /**
     * @param config - Optional scraper configuration.
     */
    constructor(config?: WebScraperConfig);

    /**
     * Runs the processor.
     *
     * @param params - Web scraping parameters.
     * @param context - Optional processing context.
     * @returns A promise of the scraped content (imported `ScrapedContent$1`).
     */
    execute(params: WebScrapingParams, context?: ProcessingContext): Promise<ScrapedContent$1>;

    private scrapeWithConfig;
    private extractContent;
    private getPage;

    /**
     * Asynchronous cleanup hook.
     * NOTE(review): presumably releases the private `browser` — confirm
     * against the implementation.
     */
    cleanup(): Promise<void>;

    /**
     * Registers a plugin for platform-specific handling.
     *
     * @param plugin - Plugin to register.
     */
    registerPlugin(plugin: WebScraperPlugin): void;
}

/**
 * Factory function that creates a web scraper.
 *
 * @param config - Optional scraper configuration.
 * @returns A `WebScraperProcessor` instance.
 */
declare function createWebScraper(config?: WebScraperConfig): WebScraperProcessor;

/**
 * Configuration for the generic, schema-driven AI extraction processor.
 * The processor is agnostic to the type of content being extracted.
 */
interface AIExtractionConfig {
    /** AI provider to use. */
    provider?: "openai" | "google";

    /** API key. NOTE(review): relationship to `openaiApiKey` is not visible here — confirm. */
    apiKey?: string;

    /** OpenAI-specific API key. NOTE(review): precedence versus `apiKey` is not visible here — confirm. */
    openaiApiKey?: string;

    /** Model identifier. */
    model?: string;

    /** Sampling temperature. */
    temperature?: number;
}

/**
 * Extraction recipe registered via
 * {@link AIExtractionProcessor.registerExtractor}.
 */
interface GenericExtractionConfig {
    /** Zod schema associated with this extraction. */
    schema: z.ZodType<any>;

    /** System prompt text. */
    systemPrompt: string;

    /**
     * User prompt template.
     * NOTE(review): placeholder syntax is not visible in this declaration —
     * see the private `interpolatePrompt` implementation.
     */
    userPromptTemplate: string;

    /** Optional input/output example pairs. */
    examples?: { input: string; output: any }[];

    /** Optional transformation taking and returning extracted data. */
    postProcess?: (data: any) => any;

    /** Optional temperature for this extraction. */
    temperature?: number;
}

/**
 * Processor plugin that extracts structured data from content using AI models.
 *
 * Only the declaration is visible here; private members are listed without
 * types and their behavior cannot be determined from this file.
 */
declare class AIExtractionProcessor implements ProcessorPlugin {
    readonly name = "ai-extractor";
    readonly description = "Extracts structured data from content using AI models";

    private model;
    private config;
    private extractionConfigs;

    /**
     * @param config - Extraction processor configuration (required).
     */
    constructor(config: AIExtractionConfig);

    private createModel;

    /**
     * Registers a custom extraction configuration for a schema type.
     *
     * @param schemaName - Name under which the configuration is registered.
     * @param config - Extraction configuration to register.
     */
    registerExtractor(schemaName: string, config: GenericExtractionConfig): void;

    /**
     * Runs the processor.
     *
     * @param params - AI extraction parameters.
     * @param context - Optional processing context.
     * @returns A promise typed as `unknown`; callers must narrow the result.
     */
    execute(params: AIExtractionParams, context?: ProcessingContext): Promise<unknown>;

    private extractWithConfig;
    private interpolatePrompt;
}

/**
 * Factory function that creates an AI extraction processor.
 *
 * @param config - Extraction processor configuration.
 * @returns An `AIExtractionProcessor` instance.
 */
declare function createAIExtractor(config: AIExtractionConfig): AIExtractionProcessor;

export {
    type AIExtractionConfig,
    AIExtractionParams,
    AIExtractionProcessor,
    type GenericExtractionConfig,
    ProcessingContext,
    ProcessorPlugin,
    ScrapedContent$1 as ScrapedContent,
    type WebScraperConfig,
    WebScraperProcessor,
    WebScrapingParams,
    createAIExtractor,
    createWebScraper,
};