weavebot-core
Version:
Generic content processing framework for web scraping and AI extraction
496 lines (490 loc) • 15.2 kB
JavaScript
// src/processors/web-scraper.ts
import { chromium } from "playwright";
import * as cheerio from "cheerio";
// src/types/index.ts
var WeaveBotError = class extends Error {
type;
context;
constructor(message, type, context = {}) {
super(message);
this.name = "WeaveBotError";
this.type = type;
this.context = context;
}
};
/**
 * Error raised when fetching or reading a web page fails.
 * Always tagged with the type "SCRAPING_FAILED".
 */
var ScrapingError = class extends WeaveBotError {
  /**
   * @param message Human-readable description of the failure.
   * @param context Optional diagnostic data; defaults to an empty object.
   */
  constructor(message, context = {}) {
    const failureType = "SCRAPING_FAILED";
    super(message, failureType, context);
    // Override the name assigned by the base constructor.
    this.name = "ScrapingError";
  }
};
/**
 * Error raised when structured-data extraction fails.
 * Always tagged with the type "EXTRACTION_FAILED".
 */
var ExtractionError = class extends WeaveBotError {
  /**
   * @param message Human-readable description of the failure.
   * @param context Optional diagnostic data; defaults to an empty object.
   */
  constructor(message, context = {}) {
    const failureType = "EXTRACTION_FAILED";
    super(message, failureType, context);
    // Override the name assigned by the base constructor.
    this.name = "ExtractionError";
  }
};
// src/interfaces/web-scraper-plugin.ts
var WebScraperPluginRegistry = class {
plugins = [];
register(plugin) {
this.plugins.push(plugin);
}
findPlugin(url) {
return this.plugins.find((plugin) => plugin.canHandle(url)) || null;
}
getAll() {
return [...this.plugins];
}
};
// src/utils/index.ts
var Logger = class {
logLevel;
constructor(level = "info") {
this.logLevel = this.parseLogLevel(level);
if (process.env.NODE_ENV === "production") {
this.setupProductionLogging();
}
}
parseLogLevel(level) {
switch (level.toLowerCase()) {
case "debug":
return 0 /* DEBUG */;
case "info":
return 1 /* INFO */;
case "warn":
return 2 /* WARN */;
case "error":
return 3 /* ERROR */;
default:
return 1 /* INFO */;
}
}
setupProductionLogging() {
const originalLog = console.log;
const originalWarn = console.warn;
const originalError = console.error;
console.log = (...args) => {
if (typeof args[0] === "object" && args[0].level !== void 0) {
originalLog(JSON.stringify(args[0]));
} else {
originalLog(...args);
}
};
console.warn = (...args) => {
if (typeof args[0] === "object" && args[0].level !== void 0) {
originalWarn(JSON.stringify(args[0]));
} else {
originalWarn(...args);
}
};
console.error = (...args) => {
if (typeof args[0] === "object" && args[0].level !== void 0) {
originalError(JSON.stringify(args[0]));
} else {
originalError(...args);
}
};
}
shouldLog(level) {
return level >= this.logLevel;
}
createLogEntry(level, message, context, error) {
return {
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
level,
message,
context: context ? this.sanitizeContext(context) : void 0,
error: error ? {
name: error.name,
message: error.message,
stack: error.stack
} : void 0
};
}
sanitizeContext(context) {
const sanitized = { ...context };
const sensitiveKeys = ["password", "token", "key", "secret", "apikey"];
Object.keys(sanitized).forEach((key) => {
if (sensitiveKeys.some((sensitive) => key.toLowerCase().includes(sensitive))) {
sanitized[key] = "[REDACTED]";
}
});
return sanitized;
}
formatLogEntry(entry) {
const levelNames = {
[0 /* DEBUG */]: "DEBUG",
[1 /* INFO */]: "INFO",
[2 /* WARN */]: "WARN",
[3 /* ERROR */]: "ERROR"
};
let formatted = `[${entry.timestamp}] ${levelNames[entry.level]}: ${entry.message}`;
if (entry.context && Object.keys(entry.context).length > 0) {
formatted += ` | Context: ${JSON.stringify(entry.context)}`;
}
if (entry.error) {
formatted += ` | Error: ${entry.error.message}`;
if (entry.error.stack && this.logLevel === 0 /* DEBUG */) {
formatted += `
Stack: ${entry.error.stack}`;
}
}
return formatted;
}
output(entry) {
if (process.env.NODE_ENV === "production") {
const structuredLog = {
timestamp: entry.timestamp,
level: entry.level,
message: entry.message,
...entry.context,
error: entry.error
};
switch (entry.level) {
case 3 /* ERROR */:
console.error(structuredLog);
break;
case 2 /* WARN */:
console.warn(structuredLog);
break;
default:
console.log(structuredLog);
}
} else {
const formatted = this.formatLogEntry(entry);
switch (entry.level) {
case 3 /* ERROR */:
console.error(`\x1B[31m${formatted}\x1B[0m`);
break;
case 2 /* WARN */:
console.warn(`\x1B[33m${formatted}\x1B[0m`);
break;
case 1 /* INFO */:
console.log(`\x1B[36m${formatted}\x1B[0m`);
break;
case 0 /* DEBUG */:
console.log(`\x1B[90m${formatted}\x1B[0m`);
break;
}
}
}
debug(message, context) {
if (this.shouldLog(0 /* DEBUG */)) {
const entry = this.createLogEntry(0 /* DEBUG */, message, context);
this.output(entry);
}
}
info(message, context) {
if (this.shouldLog(1 /* INFO */)) {
const entry = this.createLogEntry(1 /* INFO */, message, context);
this.output(entry);
}
}
warn(message, context) {
if (this.shouldLog(2 /* WARN */)) {
const entry = this.createLogEntry(2 /* WARN */, message, context);
this.output(entry);
}
}
error(message, contextOrError, error) {
if (this.shouldLog(3 /* ERROR */)) {
let context;
let actualError;
if (contextOrError instanceof Error) {
actualError = contextOrError;
} else {
context = contextOrError;
actualError = error;
}
const entry = this.createLogEntry(3 /* ERROR */, message, context, actualError);
this.output(entry);
}
}
// Performance logging utility
time(label, context) {
const startTime = Date.now();
this.debug(`Timer started: ${label}`, context);
return () => {
const duration = Date.now() - startTime;
this.info(`Timer ended: ${label}`, { ...context, duration });
};
}
};
/**
 * Builds a logger. The level is taken from the argument, then from the
 * LOG_LEVEL environment variable, and finally falls back to "info".
 */
var createLogger = (level) => new Logger(level || process.env.LOG_LEVEL || "info");

/** Shared logger instance used by the processors in this module. */
var defaultLogger = createLogger();
// src/processors/web-scraper.ts
/**
 * Processor that opens a URL in a Chromium page (via Playwright) and returns
 * the page's title, whitespace-normalised body text, raw HTML and metadata.
 *
 * Site-specific behaviour can be supplied through plugins, which may provide
 * per-URL configuration and `beforeExtract` / `extractMetadata` /
 * `postProcess` hooks.
 *
 * A single browser instance is launched lazily and reused across calls; call
 * `cleanup()` to close it.
 */
var WebScraperProcessor = class {
  name = "web-scraper";
  description = "Scrapes web content with configurable strategies";
  config;
  pluginRegistry;
  /** Shared browser instance, or null while none is running. */
  browser = null;
  /** In-flight launch promise, so concurrent callers share one launch. */
  browserLaunch = null;

  /**
   * @param config Optional settings read by this class: `plugins`,
   *   `defaultTimeout`, `defaultViewport`, `headless`.
   */
  constructor(config = {}) {
    this.config = config;
    this.pluginRegistry = new WebScraperPluginRegistry();
    if (config.plugins) {
      config.plugins.forEach((plugin) => this.pluginRegistry.register(plugin));
    }
  }

  /**
   * Scrapes one URL.
   *
   * Settings are resolved from most to least specific: per-call `options`,
   * then the matching plugin's config, then this processor's defaults.
   *
   * @param params `{ url, strategy?, options? }`, where `options` may carry
   *   `timeout`, `waitForSelector` and `userAgent`.
   * @param context Extra key/values merged into log entries and into the
   *   context of a thrown error.
   * @returns The scraped content object.
   * @throws ScrapingError wrapping any failure raised while scraping.
   */
  async execute(params, context = {}) {
    const { url, strategy, options } = params;
    const startTime = Date.now();
    try {
      defaultLogger.info("Starting web scraping", {
        url,
        strategy: strategy || "auto",
        ...context
      });
      const plugin = this.pluginRegistry.findPlugin(url);
      const pluginConfig = plugin?.getConfig(url);
      const finalConfig = {
        strategy: strategy || pluginConfig?.strategy || "auto",
        // `??` rather than `||` so an explicit timeout of 0 is respected
        // instead of being replaced by the next fallback.
        timeout: options?.timeout ?? pluginConfig?.timeout ?? this.config.defaultTimeout ?? 3e4,
        // Plugin-specific viewport wins over the processor-wide default,
        // matching the precedence used for the timeout.
        viewport: pluginConfig?.viewport || this.config.defaultViewport,
        waitSelectors: options?.waitForSelector ? [options.waitForSelector] : pluginConfig?.waitSelectors,
        headers: options?.userAgent ? { "User-Agent": options.userAgent } : pluginConfig?.headers,
        blockResources: pluginConfig?.blockResources,
        additionalWait: pluginConfig?.additionalWait
      };
      const content = await this.scrapeWithConfig(url, finalConfig, plugin);
      const duration = Date.now() - startTime;
      defaultLogger.info("Web scraping completed", {
        url,
        plugin: plugin?.name || "none",
        duration,
        ...context
      });
      return content;
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      defaultLogger.error("Web scraping failed", {
        error: errorMessage,
        url,
        ...context
      });
      throw new ScrapingError(`Failed to scrape ${url}: ${errorMessage}`, {
        ...context
      });
    }
  }

  /**
   * Loads `url` in a fresh page using the resolved config, runs the plugin
   * hooks and returns the extracted content. The page is always closed,
   * whether or not scraping succeeds.
   */
  async scrapeWithConfig(url, config, plugin) {
    const page = await this.getPage();
    try {
      if (config.viewport) {
        await page.setViewportSize(config.viewport);
      }
      if (config.headers) {
        await page.setExtraHTTPHeaders(config.headers);
      }
      if (config.blockResources) {
        // Skip heavy assets that are not needed for text extraction.
        await page.route("**/*", (route) => {
          const resourceType = route.request().resourceType();
          if (["image", "stylesheet", "font", "media"].includes(resourceType)) {
            route.abort();
          } else {
            route.continue();
          }
        });
      }
      await page.goto(url, {
        waitUntil: config.strategy === "spa" ? "networkidle" : "domcontentloaded",
        timeout: config.timeout
      });
      const context = {
        url,
        logger: defaultLogger,
        state: {}
      };
      if (plugin?.beforeExtract) {
        await plugin.beforeExtract(page, context);
      }
      if (config.waitSelectors && config.waitSelectors.length > 0) {
        // Best effort: a selector that never appears must not fail the scrape.
        await Promise.all(
          config.waitSelectors.map(
            (selector) => page.waitForSelector(selector, { timeout: 5e3 }).catch((waitError) => {
              defaultLogger.debug("Wait selector did not appear", {
                url,
                selector,
                error: waitError instanceof Error ? waitError.message : String(waitError)
              });
            })
          )
        );
      }
      if (config.additionalWait) {
        await page.waitForTimeout(config.additionalWait);
      }
      const content = await this.extractContent(page, url);
      if (plugin?.extractMetadata) {
        const metadata = await plugin.extractMetadata(page, context);
        content.metadata = { ...content.metadata || {}, ...metadata };
      }
      const finalContent = plugin?.postProcess ? plugin.postProcess(content, context) : content;
      return finalContent;
    } finally {
      await page.close();
    }
  }

  /**
   * Builds the content object from the page's current HTML.
   *
   * @returns `{ url, title, text, html, metadata, extractedAt }`. `metadata`
   *   holds common meta/OpenGraph tags and, when the page contains
   *   `application/ld+json` scripts, a `jsonLd` array with every block that
   *   parsed successfully.
   */
  async extractContent(page, url) {
    const html = await page.content();
    const $ = cheerio.load(html);

    // JSON-LD lives in <script> elements, so it has to be collected BEFORE
    // scripts are stripped for text extraction below.
    let jsonLd;
    const jsonLdScripts = $('script[type="application/ld+json"]');
    if (jsonLdScripts.length > 0) {
      jsonLd = [];
      jsonLdScripts.each((_, element) => {
        try {
          jsonLd.push(JSON.parse($(element).html() || "{}"));
        } catch (parseError) {
          // Malformed JSON-LD is skipped; the rest of the page is still usable.
          defaultLogger.debug("Skipping malformed JSON-LD block", {
            url,
            error: parseError instanceof Error ? parseError.message : String(parseError)
          });
        }
      });
    }

    $("script, style, noscript").remove();
    const text = $("body").text().replace(/\s+/g, " ").trim();
    const title = $("title").text() || $("h1").first().text() || "";
    const metadata = {
      description: $('meta[name="description"]').attr("content") || "",
      keywords: $('meta[name="keywords"]').attr("content") || "",
      author: $('meta[name="author"]').attr("content") || "",
      ogTitle: $('meta[property="og:title"]').attr("content") || "",
      ogDescription: $('meta[property="og:description"]').attr("content") || "",
      ogImage: $('meta[property="og:image"]').attr("content") || "",
      ...(jsonLd ? { jsonLd } : {})
    };
    return {
      url,
      title,
      text,
      html,
      metadata,
      extractedAt: new Date()
    };
  }

  /** Opens a new page in the shared browser, launching the browser first if needed. */
  async getPage() {
    const browser = this.browser ?? await this.launchBrowser();
    return await browser.newPage();
  }

  /**
   * Launches Chromium (headless unless `config.headless === false`) and
   * stores it on `this.browser`.
   *
   * Concurrent callers share the same in-flight launch, so only one browser
   * is ever started. A failed launch clears the pending promise so a later
   * call can retry.
   */
  launchBrowser() {
    if (!this.browserLaunch) {
      this.browserLaunch = chromium.launch({
        headless: this.config.headless !== false
      }).then((browser) => {
        this.browser = browser;
        return browser;
      }).finally(() => {
        this.browserLaunch = null;
      });
    }
    return this.browserLaunch;
  }

  /** Closes the shared browser, if any. Safe to call when none is running. */
  async cleanup() {
    if (this.browserLaunch) {
      try {
        // Wait for a launch in progress so that browser is closed too.
        await this.browserLaunch;
      } catch {
        // The launch failure is reported to whoever requested the page.
      }
    }
    const browser = this.browser;
    if (browser) {
      this.browser = null;
      await browser.close();
    }
  }

  /**
   * Register a plugin for platform-specific handling
   */
  registerPlugin(plugin) {
    this.pluginRegistry.register(plugin);
  }
};
/**
 * Factory for the web scraper processor.
 * @param config Optional processor configuration, passed through unchanged.
 * @returns A new `WebScraperProcessor`.
 */
function createWebScraper(config) {
  const scraper = new WebScraperProcessor(config);
  return scraper;
}
// src/processors/ai-extractor.ts
import { generateObject } from "ai";
import { openai } from "@ai-sdk/openai";
import { google } from "@ai-sdk/google";
var AIExtractionProcessor = class {
name = "ai-extractor";
description = "Extracts structured data from content using AI models";
model;
config;
extractionConfigs = /* @__PURE__ */ new Map();
constructor(config) {
this.config = config;
this.model = this.createModel(config);
}
createModel(config) {
const provider = config.provider || "openai";
const apiKey = config.apiKey || "";
if (provider === "google" && apiKey) {
process.env.GOOGLE_GENERATIVE_AI_API_KEY = apiKey;
}
switch (provider) {
case "openai":
return openai(config.model || "gpt-4o");
case "google":
return google(config.model || "gemini-2.5-flash");
default:
throw new Error(`Unsupported AI provider: ${provider}`);
}
}
/**
* Register a custom extraction configuration for a schema type
*/
registerExtractor(schemaName, config) {
this.extractionConfigs.set(schemaName, config);
}
async execute(params, context = {}) {
const { content, schema, url, model } = params;
defaultLogger.info("Starting AI extraction", {
url: url || content.url,
schema: typeof schema === "string" ? schema : "custom",
...context
});
try {
if (typeof schema === "string") {
const config = this.extractionConfigs.get(schema);
if (!config) {
throw new ExtractionError(
`No extraction configuration registered for schema '${schema}'`,
context
);
}
return await this.extractWithConfig(content, config, url || content.url);
}
const result = await generateObject({
model: model ? this.createModel({ ...this.config, model }) : this.model,
schema,
prompt: `Extract structured data from this content:
${content.text}`,
temperature: this.config.temperature || 0.1
});
return result.object;
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
defaultLogger.error("AI extraction failed", {
error: errorMessage,
url: url || content.url,
...context
});
throw new ExtractionError(`Failed to extract data: ${errorMessage}`, {
...context
});
}
}
async extractWithConfig(content, config, url) {
const userPrompt = this.interpolatePrompt(config.userPromptTemplate, {
content: content.text,
url,
title: content.title || "",
html: content.html || "",
examples: config.examples ? config.examples.map((ex) => `Input: ${ex.input}
Output: ${JSON.stringify(ex.output)}`).join("\n\n") : ""
});
const result = await generateObject({
model: this.model,
schema: config.schema,
prompt: userPrompt,
system: config.systemPrompt,
temperature: config.temperature ?? this.config.temperature ?? 0.1
});
return config.postProcess ? config.postProcess(result.object) : result.object;
}
interpolatePrompt(template, values) {
return template.replace(/\{\{(\w+)\}\}/g, (match, key) => {
return values[key] || match;
});
}
};
/**
 * Factory for the AI extraction processor.
 * @param config Processor configuration, passed through unchanged.
 * @returns A new `AIExtractionProcessor`.
 */
function createAIExtractor(config) {
  const extractor = new AIExtractionProcessor(config);
  return extractor;
}
export {
AIExtractionProcessor,
WebScraperProcessor,
createAIExtractor,
createWebScraper
};
//# sourceMappingURL=processors.mjs.map