UNPKG

weavebot-core

Version:

Generic content processing framework for web scraping and AI extraction

496 lines (490 loc) 15.2 kB
// src/processors/web-scraper.ts
import { chromium } from "playwright";
import * as cheerio from "cheerio";

// src/types/index.ts

/**
 * Base class for all errors raised by this module.
 *
 * Carries a machine-readable `type` and a free-form `context` object
 * alongside the usual message.
 */
class WeaveBotError extends Error {
  type;
  context;

  /**
   * @param {string} message - Human-readable description.
   * @param {string} type - Machine-readable error category.
   * @param {object} [context={}] - Extra data attached to the error.
   */
  constructor(message, type, context = {}) {
    super(message);
    this.name = "WeaveBotError";
    this.type = type;
    this.context = context;
  }
}

/** Error raised when scraping a page fails (`type` is "SCRAPING_FAILED"). */
class ScrapingError extends WeaveBotError {
  /**
   * @param {string} message - Human-readable description.
   * @param {object} [context={}] - Extra data attached to the error.
   */
  constructor(message, context = {}) {
    super(message, "SCRAPING_FAILED", context);
    this.name = "ScrapingError";
  }
}

/** Error raised when AI extraction fails (`type` is "EXTRACTION_FAILED"). */
class ExtractionError extends WeaveBotError {
  /**
   * @param {string} message - Human-readable description.
   * @param {object} [context={}] - Extra data attached to the error.
   */
  constructor(message, context = {}) {
    super(message, "EXTRACTION_FAILED", context);
    this.name = "ExtractionError";
  }
}

// src/interfaces/web-scraper-plugin.ts

/**
 * Ordered collection of scraper plugins.
 *
 * Plugins are consulted in registration order; the first one whose
 * `canHandle(url)` returns a truthy value wins.
 */
class WebScraperPluginRegistry {
  plugins = [];

  /** Appends a plugin to the registry. */
  register(plugin) {
    this.plugins.push(plugin);
  }

  /**
   * @param {string} url - URL about to be scraped.
   * @returns {object|null} The first plugin that can handle `url`, or null.
   */
  findPlugin(url) {
    return this.plugins.find((plugin) => plugin.canHandle(url)) || null;
  }

  /** @returns {object[]} A shallow copy of the registered plugins. */
  getAll() {
    return [...this.plugins];
  }
}

// src/utils/index.ts

/** Numeric log levels; a higher number means higher severity. */
const LogLevel = Object.freeze({ DEBUG: 0, INFO: 1, WARN: 2, ERROR: 3 });

/** Maps the lower-cased level name to its numeric level. */
const LOG_LEVELS_BY_NAME = new Map([
  ["debug", LogLevel.DEBUG],
  ["info", LogLevel.INFO],
  ["warn", LogLevel.WARN],
  ["error", LogLevel.ERROR],
]);

/** Display names used by the human-readable formatter. */
const LOG_LEVEL_NAMES = Object.freeze({
  [LogLevel.DEBUG]: "DEBUG",
  [LogLevel.INFO]: "INFO",
  [LogLevel.WARN]: "WARN",
  [LogLevel.ERROR]: "ERROR",
});

/** Context keys containing any of these fragments are redacted. */
const SENSITIVE_KEY_FRAGMENTS = ["password", "token", "key", "secret", "apikey"];

/** Marker stored on `console` once the production wrappers are installed. */
const CONSOLE_PATCHED = Symbol.for("weavebot-core.structuredConsole");

/**
 * Levelled logger.
 *
 * When `NODE_ENV` is "production" it emits structured objects (serialised to
 * JSON by the patched console); otherwise it prints ANSI-coloured text lines.
 */
class Logger {
  logLevel;

  /**
   * @param {string} [level="info"] - Minimum level name to emit
   *   ("debug" | "info" | "warn" | "error"); unknown names fall back to info.
   */
  constructor(level = "info") {
    this.logLevel = this.parseLogLevel(level);
    if (process.env.NODE_ENV === "production") {
      this.setupProductionLogging();
    }
  }

  /**
   * Converts a level name to its numeric value.
   *
   * @param {string} level - Case-insensitive level name.
   * @returns {number} Numeric level; INFO for unrecognised names.
   */
  parseLogLevel(level) {
    return LOG_LEVELS_BY_NAME.get(String(level).toLowerCase()) ?? LogLevel.INFO;
  }

  /**
   * Wraps `console.log/warn/error` so that an object argument carrying a
   * `level` property is printed as a single JSON string.
   *
   * The wrappers are installed at most once per console object; constructing
   * several loggers used to stack one wrapper on top of another.
   */
  setupProductionLogging() {
    if (console[CONSOLE_PATCHED]) {
      return;
    }

    const wrap = (original) => (...args) => {
      const [first] = args;
      // `typeof null === "object"`, so null must be excluded explicitly or
      // `console.log(null)` would throw when reading `.level`.
      const isStructured =
        first !== null && typeof first === "object" && first.level !== undefined;
      if (isStructured) {
        original(JSON.stringify(first));
      } else {
        original(...args);
      }
    };

    console.log = wrap(console.log);
    console.warn = wrap(console.warn);
    console.error = wrap(console.error);
    console[CONSOLE_PATCHED] = true;
  }

  /** @returns {boolean} True when `level` is at or above the configured level. */
  shouldLog(level) {
    return level >= this.logLevel;
  }

  /**
   * Builds a log entry object.
   *
   * @param {number} level - Numeric log level.
   * @param {string} message - Log message.
   * @param {object} [context] - Extra fields; sensitive keys are redacted.
   * @param {Error} [error] - Error whose name/message/stack are recorded.
   * @returns {object} Entry with timestamp, level, message, context, error.
   */
  createLogEntry(level, message, context, error) {
    return {
      timestamp: new Date().toISOString(),
      level,
      message,
      context: context ? this.sanitizeContext(context) : undefined,
      error: error
        ? { name: error.name, message: error.message, stack: error.stack }
        : undefined,
    };
  }

  /**
   * Returns a shallow copy of `context` in which every top-level key whose
   * lower-cased name contains a sensitive fragment is replaced by
   * "[REDACTED]". Nested objects are not inspected.
   */
  sanitizeContext(context) {
    const sanitized = { ...context };
    for (const key of Object.keys(sanitized)) {
      const lowered = key.toLowerCase();
      if (SENSITIVE_KEY_FRAGMENTS.some((fragment) => lowered.includes(fragment))) {
        sanitized[key] = "[REDACTED]";
      }
    }
    return sanitized;
  }

  /**
   * Renders an entry as a single human-readable line.
   * The stack trace is appended only when the logger runs at DEBUG level.
   */
  formatLogEntry(entry) {
    let formatted = `[${entry.timestamp}] ${LOG_LEVEL_NAMES[entry.level]}: ${entry.message}`;
    if (entry.context && Object.keys(entry.context).length > 0) {
      formatted += ` | Context: ${JSON.stringify(entry.context)}`;
    }
    if (entry.error) {
      formatted += ` | Error: ${entry.error.message}`;
      if (entry.error.stack && this.logLevel === LogLevel.DEBUG) {
        formatted += ` Stack: ${entry.error.stack}`;
      }
    }
    return formatted;
  }

  /**
   * Writes an entry to the console: a structured object in production,
   * an ANSI-coloured text line otherwise.
   */
  output(entry) {
    if (process.env.NODE_ENV === "production") {
      const structuredLog = {
        timestamp: entry.timestamp,
        level: entry.level,
        message: entry.message,
        ...entry.context,
        error: entry.error,
      };
      switch (entry.level) {
        case LogLevel.ERROR:
          console.error(structuredLog);
          break;
        case LogLevel.WARN:
          console.warn(structuredLog);
          break;
        default:
          console.log(structuredLog);
      }
      return;
    }

    const formatted = this.formatLogEntry(entry);
    switch (entry.level) {
      case LogLevel.ERROR:
        console.error(`\x1B[31m${formatted}\x1B[0m`);
        break;
      case LogLevel.WARN:
        console.warn(`\x1B[33m${formatted}\x1B[0m`);
        break;
      case LogLevel.INFO:
        console.log(`\x1B[36m${formatted}\x1B[0m`);
        break;
      case LogLevel.DEBUG:
        console.log(`\x1B[90m${formatted}\x1B[0m`);
        break;
    }
  }

  /** Logs `message` at DEBUG level. */
  debug(message, context) {
    if (this.shouldLog(LogLevel.DEBUG)) {
      this.output(this.createLogEntry(LogLevel.DEBUG, message, context));
    }
  }

  /** Logs `message` at INFO level. */
  info(message, context) {
    if (this.shouldLog(LogLevel.INFO)) {
      this.output(this.createLogEntry(LogLevel.INFO, message, context));
    }
  }

  /** Logs `message` at WARN level. */
  warn(message, context) {
    if (this.shouldLog(LogLevel.WARN)) {
      this.output(this.createLogEntry(LogLevel.WARN, message, context));
    }
  }

  /**
   * Logs `message` at ERROR level.
   *
   * @param {string} message - Log message.
   * @param {object|Error} [contextOrError] - Either a context object or the
   *   error itself (in which case no context is recorded).
   * @param {Error} [error] - The error, when the second argument is a context.
   */
  error(message, contextOrError, error) {
    if (!this.shouldLog(LogLevel.ERROR)) {
      return;
    }
    let context;
    let actualError;
    if (contextOrError instanceof Error) {
      actualError = contextOrError;
    } else {
      context = contextOrError;
      actualError = error;
    }
    this.output(this.createLogEntry(LogLevel.ERROR, message, context, actualError));
  }

  // Performance logging utility
  /**
   * Starts a timer and returns a function that, when called, logs the
   * elapsed milliseconds at INFO level as `duration`.
   */
  time(label, context) {
    const startTime = Date.now();
    this.debug(`Timer started: ${label}`, context);
    return () => {
      const duration = Date.now() - startTime;
      this.info(`Timer ended: ${label}`, { ...context, duration });
    };
  }
}

/**
 * Creates a logger using, in order of preference, the given level, the
 * `LOG_LEVEL` environment variable, or "info".
 */
const createLogger = (level) => {
  return new Logger(level || process.env.LOG_LEVEL || "info");
};

const defaultLogger = createLogger();

// src/processors/web-scraper.ts

/** Resource types aborted when a plugin enables `blockResources`. */
const BLOCKED_RESOURCE_TYPES = new Set(["image", "stylesheet", "font", "media"]);

/**
 * Processor that loads a URL in a Chromium page (via Playwright) and returns
 * its title, visible text, raw HTML and metadata. Plugins can customise the
 * strategy, waits, metadata and post-processing per URL.
 */
class WebScraperProcessor {
  name = "web-scraper";
  description = "Scrapes web content with configurable strategies";
  config;
  pluginRegistry;
  browser = null;

  /**
   * @param {object} [config={}] - Reads `plugins`, `defaultTimeout`,
   *   `defaultViewport` and `headless`.
   */
  constructor(config = {}) {
    this.config = config;
    this.pluginRegistry = new WebScraperPluginRegistry();
    if (config.plugins) {
      config.plugins.forEach((plugin) => this.pluginRegistry.register(plugin));
    }
  }

  /**
   * Scrapes `params.url`.
   *
   * @param {object} params - `{ url, strategy?, options? }`; `options` may
   *   carry `timeout`, `waitForSelector` and `userAgent`.
   * @param {object} [context={}] - Extra fields merged into log entries and
   *   into the context of a thrown error.
   * @returns {Promise<object>} The scraped content.
   * @throws {ScrapingError} When any step of the scrape fails; the original
   *   error is available as `cause`.
   */
  async execute(params, context = {}) {
    const { url, strategy, options } = params;
    const startTime = Date.now();
    try {
      defaultLogger.info("Starting web scraping", {
        url,
        strategy: strategy || "auto",
        ...context,
      });

      const plugin = this.pluginRegistry.findPlugin(url);
      const pluginConfig = plugin?.getConfig(url);
      const finalConfig = {
        strategy: strategy || pluginConfig?.strategy || "auto",
        timeout:
          options?.timeout ||
          pluginConfig?.timeout ||
          this.config.defaultTimeout ||
          3e4,
        // NOTE(review): unlike `timeout`, the global default takes precedence
        // over the plugin's viewport here — confirm this ordering is intended.
        viewport: this.config.defaultViewport || pluginConfig?.viewport,
        waitSelectors: options?.waitForSelector
          ? [options.waitForSelector]
          : pluginConfig?.waitSelectors,
        headers: options?.userAgent
          ? { "User-Agent": options.userAgent }
          : pluginConfig?.headers,
        blockResources: pluginConfig?.blockResources,
        additionalWait: pluginConfig?.additionalWait,
      };

      const content = await this.scrapeWithConfig(url, finalConfig, plugin);
      const duration = Date.now() - startTime;
      defaultLogger.info("Web scraping completed", {
        url,
        plugin: plugin?.name || "none",
        duration,
        ...context,
      });
      return content;
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      defaultLogger.error("Web scraping failed", {
        error: errorMessage,
        url,
        ...context,
      });
      // Keep the failing URL and the original error (with its stack) attached.
      const wrapped = new ScrapingError(`Failed to scrape ${url}: ${errorMessage}`, {
        url,
        ...context,
      });
      wrapped.cause = error;
      throw wrapped;
    }
  }

  /**
   * Opens a fresh page, applies the resolved configuration, navigates to
   * `url`, runs the plugin hooks and extracts the content. The page is always
   * closed afterwards.
   *
   * @param {string} url - URL to load.
   * @param {object} config - Resolved scrape configuration.
   * @param {object|null} plugin - Plugin handling this URL, if any.
   * @returns {Promise<object>} The (optionally post-processed) content.
   */
  async scrapeWithConfig(url, config, plugin) {
    const page = await this.getPage();
    try {
      if (config.viewport) {
        await page.setViewportSize(config.viewport);
      }
      if (config.headers) {
        await page.setExtraHTTPHeaders(config.headers);
      }
      if (config.blockResources) {
        await page.route("**/*", async (route) => {
          try {
            // abort()/continue() return promises; await them so a failure is
            // handled here instead of surfacing as an unhandled rejection.
            if (BLOCKED_RESOURCE_TYPES.has(route.request().resourceType())) {
              await route.abort();
            } else {
              await route.continue();
            }
          } catch (routeError) {
            // Presumably happens when the page closes while requests are
            // still in flight; blocking is best-effort either way.
            defaultLogger.debug("Route handling failed", {
              url,
              error: routeError instanceof Error ? routeError.message : String(routeError),
            });
          }
        });
      }

      await page.goto(url, {
        waitUntil: config.strategy === "spa" ? "networkidle" : "domcontentloaded",
        timeout: config.timeout,
      });

      const pluginContext = { url, logger: defaultLogger, state: {} };
      if (plugin?.beforeExtract) {
        await plugin.beforeExtract(page, pluginContext);
      }

      if (config.waitSelectors && config.waitSelectors.length > 0) {
        // Waiting is best-effort: a selector that never appears must not fail
        // the scrape, but leave a trace for debugging.
        await Promise.all(
          config.waitSelectors.map((selector) =>
            page.waitForSelector(selector, { timeout: 5e3 }).catch((waitError) => {
              defaultLogger.debug("Selector wait failed", {
                url,
                selector,
                error: waitError instanceof Error ? waitError.message : String(waitError),
              });
            })
          )
        );
      }

      if (config.additionalWait) {
        await page.waitForTimeout(config.additionalWait);
      }

      const content = await this.extractContent(page, url);
      if (plugin?.extractMetadata) {
        const metadata = await plugin.extractMetadata(page, pluginContext);
        content.metadata = { ...(content.metadata || {}), ...metadata };
      }

      return plugin?.postProcess ? plugin.postProcess(content, pluginContext) : content;
    } finally {
      await page.close();
    }
  }

  /**
   * Extracts title, whitespace-normalised body text, raw HTML and metadata
   * (standard meta tags, Open Graph tags, JSON-LD) from the page.
   *
   * @param {object} page - Playwright page that has finished loading.
   * @param {string} url - URL recorded on the result.
   * @returns {Promise<object>} `{ url, title, text, html, metadata, extractedAt }`.
   */
  async extractContent(page, url) {
    const html = await page.content();
    const $ = cheerio.load(html);

    // JSON-LD lives in <script> tags, so it has to be read BEFORE scripts are
    // stripped for text extraction; otherwise nothing is ever found.
    const jsonLdScripts = $('script[type="application/ld+json"]');
    const jsonLd = [];
    jsonLdScripts.each((_, element) => {
      try {
        jsonLd.push(JSON.parse($(element).html() || "{}"));
      } catch (parseError) {
        // Malformed JSON-LD is skipped rather than failing the scrape.
        defaultLogger.debug("Skipping invalid JSON-LD block", {
          url,
          error: parseError instanceof Error ? parseError.message : String(parseError),
        });
      }
    });

    $("script, style, noscript").remove();
    const text = $("body").text().replace(/\s+/g, " ").trim();
    const title = $("title").text() || $("h1").first().text() || "";

    const metaContent = (selector) => $(selector).attr("content") || "";
    const metadata = {
      description: metaContent('meta[name="description"]'),
      keywords: metaContent('meta[name="keywords"]'),
      author: metaContent('meta[name="author"]'),
      ogTitle: metaContent('meta[property="og:title"]'),
      ogDescription: metaContent('meta[property="og:description"]'),
      ogImage: metaContent('meta[property="og:image"]'),
    };
    if (jsonLdScripts.length > 0) {
      metadata.jsonLd = jsonLd;
    }

    return { url, title, text, html, metadata, extractedAt: new Date() };
  }

  /**
   * Returns a new page, launching the shared browser on first use.
   * The browser runs headless unless `config.headless` is exactly `false`.
   */
  async getPage() {
    if (!this.browser) {
      this.browser = await chromium.launch({
        headless: this.config.headless !== false,
      });
    }
    return await this.browser.newPage();
  }

  /** Closes the shared browser, if one was launched. */
  async cleanup() {
    if (this.browser) {
      await this.browser.close();
      this.browser = null;
    }
  }

  /**
   * Register a plugin for platform-specific handling
   */
  registerPlugin(plugin) {
    this.pluginRegistry.register(plugin);
  }
}

/** Creates a {@link WebScraperProcessor} with the given configuration. */
function createWebScraper(config) {
  return new WebScraperProcessor(config);
}

// src/processors/ai-extractor.ts
import { generateObject } from "ai";
import { openai } from "@ai-sdk/openai";
import { google } from "@ai-sdk/google";

/**
 * Processor that turns scraped content into structured data by calling
 * `generateObject` with either an ad-hoc schema or a registered extraction
 * configuration.
 */
class AIExtractionProcessor {
  name = "ai-extractor";
  description = "Extracts structured data from content using AI models";
  model;
  config;
  extractionConfigs = new Map();

  /**
   * @param {object} config - Reads `provider`, `apiKey`, `model` and
   *   `temperature`.
   */
  constructor(config) {
    this.config = config;
    this.model = this.createModel(config);
  }

  /**
   * Builds the model handle for the configured provider.
   *
   * @param {object} config - Reads `provider` ("openai" by default),
   *   `apiKey` and `model`.
   * @returns {object} Model instance accepted by `generateObject`.
   * @throws {Error} For an unsupported provider.
   */
  createModel(config) {
    const provider = config.provider || "openai";
    const apiKey = config.apiKey || "";

    // The default provider instances read their key from the environment, so
    // an explicitly configured key is exported there. Previously this was
    // done for Google only and the key was silently ignored for OpenAI.
    if (apiKey) {
      if (provider === "google") {
        process.env.GOOGLE_GENERATIVE_AI_API_KEY = apiKey;
      } else if (provider === "openai") {
        process.env.OPENAI_API_KEY = apiKey;
      }
    }

    switch (provider) {
      case "openai":
        return openai(config.model || "gpt-4o");
      case "google":
        return google(config.model || "gemini-2.5-flash");
      default:
        throw new Error(`Unsupported AI provider: ${provider}`);
    }
  }

  /**
   * Register a custom extraction configuration for a schema type
   */
  registerExtractor(schemaName, config) {
    this.extractionConfigs.set(schemaName, config);
  }

  /**
   * Extracts structured data from `params.content`.
   *
   * @param {object} params - `{ content, schema, url?, model? }`. `schema` is
   *   either the name of a registered extractor or a schema object passed
   *   straight to `generateObject`. `model` overrides the configured model
   *   name for this call.
   * @param {object} [context={}] - Extra fields merged into log entries and
   *   into the context of a thrown error.
   * @returns {Promise<object>} The extracted object.
   * @throws {ExtractionError} When extraction fails; the original error is
   *   available as `cause`.
   */
  async execute(params, context = {}) {
    const { content, schema, url, model } = params;
    const sourceUrl = url || content?.url;
    defaultLogger.info("Starting AI extraction", {
      url: sourceUrl,
      schema: typeof schema === "string" ? schema : "custom",
      ...context,
    });

    try {
      // Resolve the per-call override once so both paths honour it.
      const activeModel = model
        ? this.createModel({ ...this.config, model })
        : this.model;

      if (typeof schema === "string") {
        const config = this.extractionConfigs.get(schema);
        if (!config) {
          throw new ExtractionError(
            `No extraction configuration registered for schema '${schema}'`,
            context
          );
        }
        return await this.extractWithConfig(content, config, sourceUrl, activeModel);
      }

      const result = await generateObject({
        model: activeModel,
        schema,
        prompt: `Extract structured data from this content: ${content.text}`,
        // `??` rather than `||` so an explicit temperature of 0 is respected.
        temperature: this.config.temperature ?? 0.1,
      });
      return result.object;
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      defaultLogger.error("AI extraction failed", {
        error: errorMessage,
        url: sourceUrl,
        ...context,
      });
      const wrapped = new ExtractionError(`Failed to extract data: ${errorMessage}`, {
        ...context,
      });
      wrapped.cause = error;
      throw wrapped;
    }
  }

  /**
   * Runs a registered extraction configuration against `content`.
   *
   * @param {object} content - Scraped content (`text`, `title`, `html`).
   * @param {object} config - Registered configuration; reads
   *   `userPromptTemplate`, `examples`, `schema`, `systemPrompt`,
   *   `temperature` and `postProcess`.
   * @param {string} url - URL substituted for `{{url}}` in the template.
   * @param {object} [model=this.model] - Model to use for this call.
   * @returns {Promise<object>} The extracted (optionally post-processed) object.
   */
  async extractWithConfig(content, config, url, model = this.model) {
    const examples = config.examples
      ? config.examples
          .map((ex) => `Input: ${ex.input} Output: ${JSON.stringify(ex.output)}`)
          .join("\n\n")
      : "";
    const userPrompt = this.interpolatePrompt(config.userPromptTemplate, {
      content: content.text,
      url,
      title: content.title || "",
      html: content.html || "",
      examples,
    });

    const result = await generateObject({
      model,
      schema: config.schema,
      prompt: userPrompt,
      system: config.systemPrompt,
      temperature: config.temperature ?? this.config.temperature ?? 0.1,
    });
    return config.postProcess ? config.postProcess(result.object) : result.object;
  }

  /**
   * Replaces `{{name}}` placeholders in `template` with `values[name]`.
   *
   * Placeholders without a matching own property, or whose value is
   * null/undefined, are left untouched. Empty strings and other falsy values
   * ARE substituted, so `{{examples}}` disappears when there are no examples
   * instead of leaking into the prompt.
   *
   * @param {string} template - Template containing `{{name}}` placeholders.
   * @param {object} values - Replacement values keyed by placeholder name.
   * @returns {string} The interpolated prompt.
   */
  interpolatePrompt(template, values) {
    return template.replace(/\{\{(\w+)\}\}/g, (match, key) => {
      // Own properties only, so `{{constructor}}` and the like never resolve
      // to inherited Object.prototype members.
      if (!Object.hasOwn(values, key)) {
        return match;
      }
      const value = values[key];
      return value == null ? match : String(value);
    });
  }
}

/** Creates an {@link AIExtractionProcessor} with the given configuration. */
function createAIExtractor(config) {
  return new AIExtractionProcessor(config);
}

export {
  AIExtractionProcessor,
  WebScraperProcessor,
  createAIExtractor,
  createWebScraper
};
//# sourceMappingURL=processors.mjs.map