weavebot-core
Version:
Generic content processing framework for web scraping and AI extraction
186 lines (178 loc) • 5.19 kB
text/typescript
import { f as ProcessorPlugin, W as WebScrapingParams, c as ProcessingContext, k as ScrapedContent$1, A as AIExtractionParams } from './index-1GgaWFHn.mjs';
export { b as ProcessingStep } from './index-1GgaWFHn.mjs';
import { Page } from 'playwright';
import { z } from 'zod';
/**
* Plugin interface for the web scraper.
* Allows platform-specific handling without hardcoding it in the core library.
*
* `name`, `canHandle` and `getConfig` are required; the `beforeExtract`,
* `extractMetadata` and `postProcess` hooks are optional.
*
* NOTE(review): this interface is accepted by public API
* (`WebScraperConfig.plugins`, `WebScraperProcessor.registerPlugin`) but does
* not appear in the export statement at the end of this file — confirm whether
* consumers are meant to be able to import it by name.
*/
interface WebScraperPlugin {
/**
* Unique name for the plugin
*/
name: string;
/**
* Check if this plugin should handle the given URL
*
* @param url - URL to test
* @returns `true` if this plugin should handle `url`
*/
canHandle(url: string): boolean;
/**
* Get scraping configuration for this URL
*
* @param url - URL to produce a configuration for
* @returns The scraping configuration to apply to `url`
*/
getConfig(url: string): WebScrapingConfig;
/**
* Optional: Handle page-specific interactions before content extraction
*
* @param page - Playwright page for the URL being scraped
* @param context - URL, logger and optional shared state for this run
* @returns A promise that settles when the interactions are done
*/
beforeExtract?(page: Page, context: PluginContext): Promise<void>;
/**
* Optional: Extract additional metadata specific to this platform
*
* @param page - Playwright page for the URL being scraped
* @param context - URL, logger and optional shared state for this run
* @returns A promise for a free-form key/value record. How the record is
* combined with the scraped content is not visible in this declaration file.
*/
extractMetadata?(page: Page, context: PluginContext): Promise<Record<string, any>>;
/**
* Optional: Post-process the extracted content
*
* Synchronous: the result is returned directly, not as a promise.
* `ScrapedContent` here is the interface declared in this file, not the
* `ScrapedContent$1` type imported from './index-1GgaWFHn.mjs'.
*
* @param content - Content extracted from the page
* @param context - URL, logger and optional shared state for this run
* @returns The content to use in place of `content`
*/
postProcess?(content: ScrapedContent, context: PluginContext): ScrapedContent;
}
/**
* Per-URL scraping configuration, as returned by `WebScraperPlugin.getConfig`.
* Only `strategy` is required; every other field is optional.
*/
interface WebScrapingConfig {
/**
* Scraping strategy to use.
* One of the literals "static", "spa" or "auto"; what each one does is
* decided by the scraper implementation, which is not visible here.
*/
strategy: "static" | "spa" | "auto";
/**
* Selectors to wait for before extraction
*/
waitSelectors?: string[];
/**
* Maximum time to wait in milliseconds
*/
timeout?: number;
/**
* Custom headers to send (header name mapped to header value)
*/
headers?: Record<string, string>;
/**
* Viewport configuration
* (presumably in pixels — TODO confirm against the implementation)
*/
viewport?: {
width: number;
height: number;
};
/**
* Whether to block resources (images, stylesheets, etc)
*/
blockResources?: boolean;
/**
* Additional wait time after page load
* (unit not stated here; presumably milliseconds like `timeout` — TODO confirm)
*/
additionalWait?: number;
}
/**
* Context object passed as the second argument to the optional
* `WebScraperPlugin` hooks (`beforeExtract`, `extractMetadata`, `postProcess`).
*/
interface PluginContext {
/**
* Original URL being scraped
*/
url: string;
/**
* Logger instance.
* Typed as `any`, so this declaration guarantees no particular logging API.
*/
logger: any;
/**
* Shared state between plugins.
* Optional, so it may be undefined; values are untyped (`any`).
*/
state?: Record<string, any>;
}
/**
* Scraped content as seen by `WebScraperPlugin.postProcess`.
*
* NOTE(review): this is a separate declaration from `ScrapedContent$1`
* (imported from './index-1GgaWFHn.mjs'). `ScrapedContent$1` is the type
* `WebScraperProcessor.execute` resolves to and the one this module exports
* under the name `ScrapedContent`; this local interface is not exported.
* Whether the two are structurally identical cannot be told from this file.
*/
interface ScrapedContent {
// URL associated with this content.
url: string;
// Title string (presumably the page title — TODO confirm).
title: string;
// Text form of the content (presumably markup-free — TODO confirm).
text: string;
// HTML form of the content.
html: string;
// Free-form key/value metadata; values are untyped (`any`).
metadata: Record<string, any>;
// Timestamp of the extraction, as a `Date`.
extractedAt: Date;
}
/**
* Construction options for the generic, plugin-based web scraper
* (`WebScraperProcessor` / `createWebScraper`).
* Platform-specific logic is handled by plugins.
* Every field is optional.
*/
interface WebScraperConfig {
/**
* Default timeout for page operations
* (unit not stated here; presumably milliseconds — TODO confirm)
*/
defaultTimeout?: number;
/**
* Whether to run in headless mode
*/
headless?: boolean;
/**
* Default viewport size
*/
defaultViewport?: {
width: number;
height: number;
};
/**
* Plugins to register
*/
plugins?: WebScraperPlugin[];
}
/**
* `ProcessorPlugin` implementation that scrapes web content.
* Accepts `WebScraperPlugin` instances for platform-specific handling, via
* the constructor config or `registerPlugin`.
*/
declare class WebScraperProcessor implements ProcessorPlugin {
// Fixed processor identifier.
readonly name = "web-scraper";
// Human-readable summary of what this processor does.
readonly description = "Scrapes web content with configurable strategies";
// Private members below are declared without types; their shapes are
// implementation details not visible in this declaration file.
private config;
private pluginRegistry;
private browser;
/**
* @param config - Optional scraper options; may be omitted entirely
*/
constructor(config?: WebScraperConfig);
/**
* Run the scraper.
*
* @param params - Scraping parameters (type imported from './index-1GgaWFHn.mjs')
* @param context - Optional processing context
* @returns A promise for the scraped content (`ScrapedContent$1`, exported
* from this module as `ScrapedContent`)
*/
execute(params: WebScrapingParams, context?: ProcessingContext): Promise<ScrapedContent$1>;
private scrapeWithConfig;
private extractContent;
private getPage;
/**
* Asynchronous cleanup hook; the returned promise settles when it is done.
* NOTE(review): presumably releases the resource held in `browser` —
* confirm against the implementation.
*/
cleanup(): Promise<void>;
/**
* Register a plugin for platform-specific handling
*
* @param plugin - Plugin to add to this processor
*/
registerPlugin(plugin: WebScraperPlugin): void;
}
/**
* Factory function to create a web scraper
*
* @param config - Optional scraper options, the same type the
* `WebScraperProcessor` constructor accepts
* @returns A `WebScraperProcessor` instance
*/
declare function createWebScraper(config?: WebScraperConfig): WebScraperProcessor;
/**
* Construction options for the generic, schema-driven AI extraction processor
* (`AIExtractionProcessor` / `createAIExtractor`).
* The processor is completely agnostic to the type of content being extracted.
* Every field is optional.
*/
interface AIExtractionConfig {
// AI provider to use: "openai" or "google".
provider?: "openai" | "google";
// API key for the provider.
apiKey?: string;
// NOTE(review): overlaps with `apiKey`; which one takes precedence is not
// visible in this declaration file — confirm against the implementation.
openaiApiKey?: string;
// Model identifier; accepted values depend on the provider.
model?: string;
// Sampling temperature; the valid range is not specified here.
temperature?: number;
}
/**
* Extraction configuration for one schema type, registered with
* `AIExtractionProcessor.registerExtractor`.
* `schema`, `systemPrompt` and `userPromptTemplate` are required.
*/
interface GenericExtractionConfig {
// Zod schema for the extracted data.
schema: z.ZodType<any>;
// System prompt text.
systemPrompt: string;
// User prompt template. The placeholder syntax is not visible in this
// declaration file — TODO confirm against the implementation.
userPromptTemplate: string;
// Optional input/output example pairs; `output` is untyped (`any`).
examples?: Array<{
input: string;
output: any;
}>;
// Optional synchronous transform from extracted data to final data.
postProcess?: (data: any) => any;
// NOTE(review): `AIExtractionConfig` also has `temperature`; presumably this
// one overrides it for this extractor — confirm against the implementation.
temperature?: number;
}
/**
* `ProcessorPlugin` implementation that extracts structured data from content
* using AI models. Extraction behaviour per schema type is supplied through
* `registerExtractor`.
*/
declare class AIExtractionProcessor implements ProcessorPlugin {
// Fixed processor identifier.
readonly name = "ai-extractor";
// Human-readable summary of what this processor does.
readonly description = "Extracts structured data from content using AI models";
// Private members below are declared without types; their shapes are
// implementation details not visible in this declaration file.
private model;
private config;
private extractionConfigs;
/**
* @param config - Provider/model options. Required here, unlike the optional
* config of `WebScraperProcessor`, although every field inside it is optional.
*/
constructor(config: AIExtractionConfig);
private createModel;
/**
* Register a custom extraction configuration for a schema type
*
* @param schemaName - Name identifying the schema type
* @param config - Schema, prompts and optional examples/post-processing
*/
registerExtractor(schemaName: string, config: GenericExtractionConfig): void;
/**
* Run an extraction.
*
* @param params - Extraction parameters (type imported from './index-1GgaWFHn.mjs')
* @param context - Optional processing context
* @returns A promise typed as `unknown`; callers must narrow or validate the
* value before using it
*/
execute(params: AIExtractionParams, context?: ProcessingContext): Promise<unknown>;
private extractWithConfig;
private interpolatePrompt;
}
/**
* Factory function to create an AI extraction processor
*
* @param config - Provider/model options (required argument), the same type
* the `AIExtractionProcessor` constructor accepts
* @returns An `AIExtractionProcessor` instance
*/
declare function createAIExtractor(config: AIExtractionConfig): AIExtractionProcessor;
export { type AIExtractionConfig, AIExtractionParams, AIExtractionProcessor, type GenericExtractionConfig, ProcessingContext, ProcessorPlugin, ScrapedContent$1 as ScrapedContent, type WebScraperConfig, WebScraperProcessor, WebScrapingParams, createAIExtractor, createWebScraper };