crawlforge-mcp-server
Version:
CrawlForge MCP Server - Professional Model Context Protocol server with 19 comprehensive web scraping, crawling, and content processing tools.
615 lines (543 loc) • 23.2 kB
JavaScript
import dotenv from 'dotenv';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';
// Load environment variables
// ES modules have no built-in __filename/__dirname, so both are derived from
// import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// The .env file is resolved two directory levels above this module's own
// directory, independent of the process working directory.
// NOTE(review): `quiet: true` presumably suppresses dotenv's own log output —
// confirm against the installed dotenv version.
dotenv.config({ path: join(__dirname, '../../.env'), quiet: true });
/**
 * Read an integer setting from the environment.
 *
 * Parsing is always base 10. An unset, empty, or non-numeric value yields
 * `fallback` instead of NaN, so later range checks never compare against NaN.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is missing or invalid.
 * @returns {number} The parsed integer or `fallback`.
 */
function envInt(name, fallback) {
  const parsed = Number.parseInt(process.env[name] || '', 10);
  return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Read a floating-point setting from the environment.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is missing or invalid.
 * @returns {number} The parsed number or `fallback`.
 */
function envFloat(name, fallback) {
  const parsed = Number.parseFloat(process.env[name] || '');
  return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Read a comma-separated list from the environment.
 *
 * Entries are trimmed and empty entries are dropped, so a value such as
 * "a.com, b.com," produces ['a.com', 'b.com'] rather than entries with
 * stray whitespace that would never match anything.
 *
 * @param {string} name - Environment variable name.
 * @param {string} fallback - Comma-separated default used when the variable is unset or empty.
 * @returns {string[]} The cleaned list (possibly empty).
 */
function envList(name, fallback) {
  return (process.env[name] || fallback)
    .split(',')
    .map((entry) => entry.trim())
    .filter(Boolean);
}

/**
 * Read a JSON object/array setting from the environment.
 *
 * Malformed JSON, or JSON that is not an object/array, no longer aborts module
 * loading: a warning is emitted and `fallback` is used instead.
 *
 * @param {string} name - Environment variable name.
 * @param {object} fallback - Value used when the variable is missing or invalid.
 * @returns {object} The parsed value or `fallback`.
 */
function envJson(name, fallback) {
  const raw = process.env[name];
  if (!raw) {
    return fallback;
  }
  try {
    const parsed = JSON.parse(raw);
    if (parsed !== null && typeof parsed === 'object') {
      return parsed;
    }
    // console.warn writes to stderr in Node.js, keeping stdout untouched.
    console.warn(`[config] ${name} must be a JSON object; ignoring value`);
  } catch (err) {
    console.warn(`[config] Ignoring invalid JSON in ${name}: ${err.message}`);
  }
  return fallback;
}

/**
 * Application configuration, resolved once at module load from process.env.
 *
 * Boolean flags written as `!== 'false'` default to true; flags written as
 * `=== 'true'` default to false. Numeric and list settings go through the
 * helpers above so invalid input falls back to the documented default.
 */
export const config = {
  // Search Provider Configuration
  search: {
    provider: process.env.SEARCH_PROVIDER || 'auto', // 'google', 'duckduckgo', or 'auto'
    // Google Search API
    google: {
      apiKey: process.env.GOOGLE_API_KEY || '',
      searchEngineId: process.env.GOOGLE_SEARCH_ENGINE_ID || ''
    },
    // DuckDuckGo Configuration
    duckduckgo: {
      timeout: envInt('DUCKDUCKGO_TIMEOUT', 30000),
      maxRetries: envInt('DUCKDUCKGO_MAX_RETRIES', 3),
      retryDelay: envInt('DUCKDUCKGO_RETRY_DELAY', 1000),
      userAgent: process.env.DUCKDUCKGO_USER_AGENT || process.env.USER_AGENT || 'CrawlForge/1.0'
    }
  },
  // Performance
  performance: {
    maxWorkers: envInt('MAX_WORKERS', 10),
    queueConcurrency: envInt('QUEUE_CONCURRENCY', 10),
    cacheMaxSize: envInt('CACHE_MAX_SIZE', 1000),
    cacheTTL: envInt('CACHE_TTL', 3600000),
    cacheEnableDisk: process.env.CACHE_ENABLE_DISK !== 'false',
    cacheDir: process.env.CACHE_DIR || './cache'
  },
  // Rate Limiting
  rateLimit: {
    requestsPerSecond: envInt('RATE_LIMIT_REQUESTS_PER_SECOND', 10),
    requestsPerMinute: envInt('RATE_LIMIT_REQUESTS_PER_MINUTE', 100),
    perDomain: process.env.RATE_LIMIT_PER_DOMAIN !== 'false'
  },
  // Crawling
  crawling: {
    maxDepth: envInt('MAX_CRAWL_DEPTH', 5),
    maxPages: envInt('MAX_PAGES_PER_CRAWL', 100),
    respectRobots: process.env.RESPECT_ROBOTS_TXT !== 'false',
    userAgent: process.env.USER_AGENT || 'CrawlForge/1.0',
    timeout: envInt('CRAWL_TIMEOUT', 30000),
    followExternal: process.env.FOLLOW_EXTERNAL_LINKS === 'true'
  },
  // Search ranking and deduplication
  searchProcessing: {
    enableRanking: process.env.ENABLE_SEARCH_RANKING !== 'false',
    enableDeduplication: process.env.ENABLE_SEARCH_DEDUPLICATION !== 'false',
    // Ranking configuration
    ranking: {
      weights: {
        bm25: envFloat('RANKING_WEIGHT_BM25', 0.4),
        semantic: envFloat('RANKING_WEIGHT_SEMANTIC', 0.3),
        authority: envFloat('RANKING_WEIGHT_AUTHORITY', 0.2),
        freshness: envFloat('RANKING_WEIGHT_FRESHNESS', 0.1)
      },
      bm25: {
        k1: envFloat('BM25_K1', 1.5),
        b: envFloat('BM25_B', 0.75)
      }
    },
    // Deduplication configuration
    deduplication: {
      thresholds: {
        url: envFloat('DEDUP_THRESHOLD_URL', 0.8),
        title: envFloat('DEDUP_THRESHOLD_TITLE', 0.75),
        content: envFloat('DEDUP_THRESHOLD_CONTENT', 0.7),
        combined: envFloat('DEDUP_THRESHOLD_COMBINED', 0.6)
      },
      strategies: {
        urlNormalization: process.env.DEDUP_URL_NORMALIZATION !== 'false',
        titleFuzzy: process.env.DEDUP_TITLE_FUZZY !== 'false',
        contentSimhash: process.env.DEDUP_CONTENT_SIMHASH !== 'false',
        domainClustering: process.env.DEDUP_DOMAIN_CLUSTERING !== 'false'
      }
    }
  },
  // Security Configuration
  security: {
    // SSRF Protection
    ssrfProtection: {
      enabled: process.env.SSRF_PROTECTION_ENABLED !== 'false',
      allowedProtocols: envList('ALLOWED_PROTOCOLS', 'http:,https:'),
      maxRequestSize: envInt('MAX_REQUEST_SIZE', 104857600), // 100MB
      maxTimeout: envInt('MAX_REQUEST_TIMEOUT', 60000), // 60s
      maxRedirects: envInt('MAX_REDIRECTS', 5),
      allowedDomains: envList('ALLOWED_DOMAINS', ''),
      // Entries are trimmed so "localhost, 127.0.0.1" still blocks 127.0.0.1.
      blockedDomains: envList(
        'BLOCKED_DOMAINS',
        'localhost,127.0.0.1,0.0.0.0,metadata.google.internal,169.254.169.254,metadata.azure.com'
      )
    },
    // Input Validation
    inputValidation: {
      enabled: process.env.INPUT_VALIDATION_ENABLED !== 'false',
      maxStringLength: envInt('MAX_STRING_LENGTH', 10000),
      maxArrayLength: envInt('MAX_ARRAY_LENGTH', 1000),
      maxObjectDepth: envInt('MAX_OBJECT_DEPTH', 10),
      maxRegexLength: envInt('MAX_REGEX_LENGTH', 500),
      strictMode: process.env.STRICT_VALIDATION_MODE === 'true'
    },
    // API Security
    apiSecurity: {
      requireAuthentication: process.env.REQUIRE_AUTHENTICATION === 'true',
      apiKeyHeader: process.env.API_KEY_HEADER || 'X-API-Key',
      apiKey: process.env.API_KEY || '',
      rateLimitByKey: process.env.RATE_LIMIT_BY_KEY === 'true',
      auditLogging: process.env.AUDIT_LOGGING !== 'false'
    },
    // Content Security
    contentSecurity: {
      sanitizeHTML: process.env.SANITIZE_HTML !== 'false',
      allowedHTMLTags: envList('ALLOWED_HTML_TAGS', 'p,br,strong,em,u,h1,h2,h3,h4,h5,h6'),
      blockScripts: process.env.BLOCK_SCRIPTS !== 'false',
      blockIframes: process.env.BLOCK_IFRAMES !== 'false'
    }
  },
  // Monitoring
  monitoring: {
    enableMetrics: process.env.ENABLE_METRICS === 'true',
    logLevel: process.env.LOG_LEVEL || 'info',
    securityLogging: process.env.SECURITY_LOGGING !== 'false',
    violationLogging: process.env.VIOLATION_LOGGING !== 'false'
  },
  // Server
  server: {
    nodeEnv: process.env.NODE_ENV || 'development',
    port: envInt('PORT', 3000),
    enableSecurityHeaders: process.env.ENABLE_SECURITY_HEADERS !== 'false'
  },
  // Stealth Mode Configuration
  stealth: {
    // Global stealth settings
    enabled: process.env.STEALTH_MODE_ENABLED === 'true',
    defaultLevel: process.env.STEALTH_LEVEL || 'medium', // 'basic', 'medium', 'advanced'
    // Browser fingerprinting
    fingerprinting: {
      randomizeUserAgent: process.env.STEALTH_RANDOMIZE_USER_AGENT !== 'false',
      randomizeViewport: process.env.STEALTH_RANDOMIZE_VIEWPORT !== 'false',
      spoofTimezone: process.env.STEALTH_SPOOF_TIMEZONE !== 'false',
      hideWebDriver: process.env.STEALTH_HIDE_WEBDRIVER !== 'false',
      blockWebRTC: process.env.STEALTH_BLOCK_WEBRTC !== 'false',
      customUserAgent: process.env.STEALTH_CUSTOM_USER_AGENT || null
    },
    // Human behavior simulation
    humanBehavior: {
      enabled: process.env.STEALTH_HUMAN_BEHAVIOR_ENABLED !== 'false',
      mouseMovements: process.env.STEALTH_MOUSE_MOVEMENTS !== 'false',
      naturalTyping: process.env.STEALTH_NATURAL_TYPING !== 'false',
      scrollBehavior: process.env.STEALTH_SCROLL_BEHAVIOR !== 'false',
      idlePeriods: process.env.STEALTH_IDLE_PERIODS !== 'false',
      readingSimulation: process.env.STEALTH_READING_SIMULATION !== 'false',
      // Timing configurations
      mouseSpeed: process.env.STEALTH_MOUSE_SPEED || 'normal', // 'slow', 'normal', 'fast'
      typingSpeed: process.env.STEALTH_TYPING_SPEED || 'normal', // 'slow', 'normal', 'fast'
      typingVariability: envFloat('STEALTH_TYPING_VARIABILITY', 0.3), // 0.0 to 1.0
      mistakeFrequency: envFloat('STEALTH_MISTAKE_FREQUENCY', 0.02), // 2% mistake rate
      // Idle period settings
      idleFrequency: envFloat('STEALTH_IDLE_FREQUENCY', 0.1), // 10% chance
      idleMinDuration: envInt('STEALTH_IDLE_MIN_DURATION', 1000), // 1 second
      idleMaxDuration: envInt('STEALTH_IDLE_MAX_DURATION', 5000), // 5 seconds
      // Click behavior
      hoverBeforeClick: process.env.STEALTH_HOVER_BEFORE_CLICK !== 'false',
      clickDelayMin: envInt('STEALTH_CLICK_DELAY_MIN', 100),
      clickDelayMax: envInt('STEALTH_CLICK_DELAY_MAX', 300)
    },
    // Advanced anti-detection
    antiDetection: {
      bypassHeadlessDetection: process.env.STEALTH_BYPASS_HEADLESS !== 'false',
      spoofPlugins: process.env.STEALTH_SPOOF_PLUGINS !== 'false',
      spoofPermissions: process.env.STEALTH_SPOOF_PERMISSIONS !== 'false',
      mockBattery: process.env.STEALTH_MOCK_BATTERY !== 'false',
      preventCanvasFingerprinting: process.env.STEALTH_PREVENT_CANVAS !== 'false',
      preventWebGLFingerprinting: process.env.STEALTH_PREVENT_WEBGL !== 'false',
      networkEmulation: process.env.STEALTH_NETWORK_EMULATION === 'true'
    },
    // Resource optimization for stealth
    resources: {
      blockImages: process.env.STEALTH_BLOCK_IMAGES === 'true',
      blockFonts: process.env.STEALTH_BLOCK_FONTS === 'true',
      blockStylesheets: process.env.STEALTH_BLOCK_CSS === 'true',
      allowTrackingPixels: process.env.STEALTH_ALLOW_TRACKING === 'true', // Allow some tracking to appear normal
      maxConcurrentContexts: envInt('STEALTH_MAX_CONTEXTS', 5)
    },
    // Geolocation spoofing
    geolocation: {
      enabled: process.env.STEALTH_SPOOF_GEOLOCATION === 'true',
      latitude: envFloat('STEALTH_LATITUDE', 40.7128), // NYC default
      longitude: envFloat('STEALTH_LONGITUDE', -74.006),
      accuracy: envInt('STEALTH_LOCATION_ACCURACY', 100)
    }
  },
  // Localization Configuration
  localization: {
    // Global localization settings
    enabled: process.env.LOCALIZATION_ENABLED === 'true',
    defaultCountry: process.env.DEFAULT_COUNTRY_CODE || 'US',
    defaultLanguage: process.env.DEFAULT_LANGUAGE || 'en-US',
    // Proxy configuration for geo-specific access
    proxy: {
      enabled: process.env.LOCALIZATION_PROXY_ENABLED === 'true',
      rotation: {
        enabled: process.env.PROXY_ROTATION_ENABLED === 'true',
        interval: envInt('PROXY_ROTATION_INTERVAL', 300000), // 5 minutes
        strategy: process.env.PROXY_ROTATION_STRATEGY || 'round-robin'
      },
      healthCheck: {
        enabled: process.env.PROXY_HEALTH_CHECK_ENABLED !== 'false',
        interval: envInt('PROXY_HEALTH_CHECK_INTERVAL', 300000), // 5 minutes
        timeout: envInt('PROXY_HEALTH_CHECK_TIMEOUT', 10000)
      },
      fallback: {
        enabled: process.env.PROXY_FALLBACK_ENABLED !== 'false',
        maxRetries: envInt('PROXY_MAX_RETRIES', 3),
        timeout: envInt('PROXY_TIMEOUT', 10000)
      }
    },
    // Translation services
    translation: {
      enabled: process.env.TRANSLATION_ENABLED === 'true',
      defaultProvider: process.env.TRANSLATION_PROVIDER || 'google',
      autoDetect: process.env.TRANSLATION_AUTO_DETECT !== 'false',
      preserveFormatting: process.env.TRANSLATION_PRESERVE_FORMAT !== 'false',
      cacheEnabled: process.env.TRANSLATION_CACHE_ENABLED !== 'false',
      cacheTTL: envInt('TRANSLATION_CACHE_TTL', 86400000) // 24 hours
    },
    // Geo-blocking bypass
    geoBlocking: {
      autoBypass: process.env.GEO_BLOCKING_AUTO_BYPASS === 'true',
      maxRetries: envInt('GEO_BLOCKING_MAX_RETRIES', 3),
      retryDelay: envInt('GEO_BLOCKING_RETRY_DELAY', 2000),
      fallbackCountries: envList('GEO_BLOCKING_FALLBACK_COUNTRIES', 'US,GB,DE,CA'),
      detectionSensitivity: process.env.GEO_BLOCKING_DETECTION_SENSITIVITY || 'medium'
    },
    // Cultural browsing simulation
    cultural: {
      enabled: process.env.CULTURAL_SIMULATION_ENABLED === 'true',
      adaptBehavior: process.env.CULTURAL_ADAPT_BEHAVIOR !== 'false',
      adaptTiming: process.env.CULTURAL_ADAPT_TIMING !== 'false',
      respectRTL: process.env.CULTURAL_RESPECT_RTL !== 'false'
    },
    // DNS configuration
    dns: {
      enabled: process.env.LOCALIZATION_DNS_ENABLED === 'true',
      overHttps: process.env.DNS_OVER_HTTPS === 'true',
      // Invalid JSON falls back to {} with a warning instead of crashing at import.
      customResolvers: envJson('CUSTOM_DNS_RESOLVERS', {}),
      preferredCountry: process.env.DNS_PREFERRED_COUNTRY || null
    }
  }
};
/**
 * Validate the loaded configuration.
 *
 * Checks, in order: Google credentials and overall search readiness (only when
 * NODE_ENV is 'production'), the SEARCH_PROVIDER value, upper bounds on crawl
 * and queue limits, and finally the localization settings.
 *
 * A limit that is NaN (e.g. a non-numeric environment value) is reported
 * explicitly; a plain `>` comparison would let it pass silently.
 *
 * @returns {string[]} Human-readable error messages; empty when valid.
 */
export function validateConfig() {
  const errors = [];

  // Check search provider configuration
  const provider = getActiveSearchProvider();
  if (config.server.nodeEnv === 'production') {
    if (provider === 'google') {
      if (!config.search.google.apiKey) {
        errors.push('GOOGLE_API_KEY is required when using Google search provider in production');
      }
      if (!config.search.google.searchEngineId) {
        errors.push('GOOGLE_SEARCH_ENGINE_ID is required when using Google search provider in production');
      }
    }
    if (!isSearchConfigured()) {
      errors.push('Search provider is not properly configured');
    }
  }

  // Validate search provider setting
  const validProviders = ['google', 'duckduckgo', 'auto'];
  if (!validProviders.includes(config.search.provider.toLowerCase())) {
    errors.push(`Invalid SEARCH_PROVIDER value. Must be one of: ${validProviders.join(', ')}`);
  }

  // Validate numeric ranges
  const numericLimits = [
    {
      name: 'MAX_CRAWL_DEPTH',
      value: config.crawling.maxDepth,
      max: 10,
      message: 'MAX_CRAWL_DEPTH should not exceed 10 for performance reasons'
    },
    {
      name: 'MAX_PAGES_PER_CRAWL',
      value: config.crawling.maxPages,
      max: 10000,
      message: 'MAX_PAGES_PER_CRAWL should not exceed 10000 for memory reasons'
    },
    {
      name: 'QUEUE_CONCURRENCY',
      value: config.performance.queueConcurrency,
      max: 50,
      message: 'QUEUE_CONCURRENCY should not exceed 50 to avoid overwhelming servers'
    }
  ];
  for (const { name, value, max, message } of numericLimits) {
    if (Number.isNaN(value)) {
      // NaN compares false against everything, so it must be caught separately.
      errors.push(`${name} must be a valid number`);
    } else if (value > max) {
      errors.push(message);
    }
  }

  // Validate localization configuration
  errors.push(...validateLocalizationConfig());
  return errors;
}
/**
 * Report whether the currently active search provider is usable.
 *
 * @returns {boolean} true for DuckDuckGo, true for Google only when both the
 *   API key and the search engine id are set, false for anything else.
 */
export function isSearchConfigured() {
  const activeProvider = getActiveSearchProvider();

  if (activeProvider === 'duckduckgo') {
    // DuckDuckGo doesn't require API credentials
    return true;
  }

  if (activeProvider === 'google') {
    const { apiKey, searchEngineId } = config.search.google;
    return Boolean(apiKey && searchEngineId);
  }

  return false;
}
/**
 * Resolve which search provider should actually be used.
 *
 * An explicit 'google' or 'duckduckgo' setting (case-insensitive) is honoured
 * as-is. Any other value, including 'auto', picks Google when both Google
 * credentials are present and DuckDuckGo otherwise.
 *
 * @returns {'google'|'duckduckgo'} The provider to use.
 */
export function getActiveSearchProvider() {
  const requested = config.search.provider.toLowerCase();

  if (requested === 'google' || requested === 'duckduckgo') {
    return requested;
  }

  // Auto mode: prefer Google if credentials available, otherwise use DuckDuckGo
  const { apiKey, searchEngineId } = config.search.google;
  return apiKey && searchEngineId ? 'google' : 'duckduckgo';
}
/**
 * Get configuration for a specific tool.
 *
 * Known tool names: 'search_web', 'crawl_deep', 'map_site',
 * 'process_document', 'scrape_with_actions'.
 *
 * @param {string} toolName - Name of the tool to look up.
 * @returns {object} The tool's configuration, or an empty object for an
 *   unknown name. Names that only exist on Object.prototype (such as
 *   'constructor' or 'toString') are treated as unknown.
 */
export function getToolConfig(toolName) {
  const provider = getActiveSearchProvider();
  const { fingerprinting, humanBehavior } = config.stealth;

  // Cache settings shared by the search tool and its ranking/dedup options.
  const cacheOptions = {
    cacheEnabled: config.performance.cacheEnableDisk,
    cacheTTL: config.performance.cacheTTL
  };

  // Stealth fields common to every tool that supports stealth mode.
  const stealthBase = {
    enabled: true,
    level: config.stealth.defaultLevel,
    randomizeFingerprint: fingerprinting.randomizeUserAgent,
    hideWebDriver: fingerprinting.hideWebDriver,
    blockWebRTC: fingerprinting.blockWebRTC
  };

  const toolConfigs = {
    search_web: {
      provider,
      // Google-specific configuration
      google: {
        apiKey: config.search.google.apiKey,
        searchEngineId: config.search.google.searchEngineId
      },
      // DuckDuckGo-specific configuration
      duckduckgo: {
        timeout: config.search.duckduckgo.timeout,
        maxRetries: config.search.duckduckgo.maxRetries,
        retryDelay: config.search.duckduckgo.retryDelay,
        userAgent: config.search.duckduckgo.userAgent
      },
      // Common configuration
      ...cacheOptions,
      rankingOptions: {
        weights: config.searchProcessing.ranking.weights,
        bm25: config.searchProcessing.ranking.bm25,
        ...cacheOptions
      },
      deduplicationOptions: {
        thresholds: config.searchProcessing.deduplication.thresholds,
        strategies: config.searchProcessing.deduplication.strategies,
        ...cacheOptions
      }
    },
    crawl_deep: {
      maxDepth: config.crawling.maxDepth,
      maxPages: config.crawling.maxPages,
      respectRobots: config.crawling.respectRobots,
      userAgent: config.crawling.userAgent,
      timeout: config.crawling.timeout,
      followExternal: config.crawling.followExternal,
      concurrency: config.performance.queueConcurrency
    },
    map_site: {
      userAgent: config.crawling.userAgent,
      timeout: config.crawling.timeout
    },
    process_document: {
      stealthMode: config.stealth.enabled ? { ...stealthBase } : { enabled: false }
    },
    scrape_with_actions: {
      stealthMode: config.stealth.enabled
        ? {
            ...stealthBase,
            simulateHumanBehavior: humanBehavior.enabled,
            customUserAgent: fingerprinting.customUserAgent
          }
        : { enabled: false },
      humanBehavior: humanBehavior.enabled
        ? {
            enabled: true,
            mouseMovements: humanBehavior.mouseMovements,
            typingVariation: humanBehavior.naturalTyping,
            scrollBehavior: humanBehavior.scrollBehavior,
            idlePeriods: humanBehavior.idlePeriods,
            readingTime: humanBehavior.readingSimulation
          }
        : { enabled: false }
    }
  };

  // Own-property check: a plain `toolConfigs[toolName] || {}` would hand back
  // inherited members like Object.prototype.toString for such names.
  return Object.prototype.hasOwnProperty.call(toolConfigs, toolName)
    ? toolConfigs[toolName]
    : {};
}
/**
 * Build the stealth settings for a given stealth level.
 *
 * 'basic' turns fingerprint randomization off and forces WebRTC blocking and
 * webdriver hiding on. 'advanced' forces all three on and adds timezone,
 * canvas, WebGL and network-emulation settings from the config. Any other
 * level (including 'medium') returns the configured values unchanged.
 *
 * @param {string} [level='medium'] - 'basic', 'medium' or 'advanced'.
 * @returns {object} Stealth settings; always has `enabled: true` and `level`.
 */
export function getStealthConfig(level = 'medium') {
  const { fingerprinting, antiDetection } = config.stealth;

  const settings = {
    enabled: true,
    level,
    randomizeFingerprint: fingerprinting.randomizeUserAgent,
    hideWebDriver: fingerprinting.hideWebDriver,
    blockWebRTC: fingerprinting.blockWebRTC,
    customUserAgent: fingerprinting.customUserAgent
  };

  // Adjust settings based on level
  if (level === 'basic') {
    return Object.assign(settings, {
      randomizeFingerprint: false,
      blockWebRTC: true,
      hideWebDriver: true
    });
  }

  if (level === 'advanced') {
    return Object.assign(settings, {
      randomizeFingerprint: true,
      blockWebRTC: true,
      hideWebDriver: true,
      spoofTimezone: fingerprinting.spoofTimezone,
      preventCanvasFingerprinting: antiDetection.preventCanvasFingerprinting,
      preventWebGLFingerprinting: antiDetection.preventWebGLFingerprinting,
      networkEmulation: antiDetection.networkEmulation
    });
  }

  return settings;
}
/**
 * Build the human-behavior simulation settings for a given level.
 *
 * 'basic' switches mouse movements, typing variation and idle periods off.
 * 'advanced' switches every simulation flag on. Any other level (including
 * 'medium') returns the configured values unchanged. `enabled` always comes
 * from the config.
 *
 * @param {string} [level='medium'] - 'basic', 'medium' or 'advanced'.
 * @returns {object} Human-behavior settings.
 */
export function getHumanBehaviorConfig(level = 'medium') {
  const behavior = config.stealth.humanBehavior;

  const settings = {
    enabled: behavior.enabled,
    mouseMovements: behavior.mouseMovements,
    typingVariation: behavior.naturalTyping,
    scrollBehavior: behavior.scrollBehavior,
    idlePeriods: behavior.idlePeriods,
    readingTime: behavior.readingSimulation
  };

  // Adjust behavior complexity based on level
  if (level === 'basic') {
    return Object.assign(settings, {
      mouseMovements: false,
      typingVariation: false,
      idlePeriods: false
    });
  }

  if (level === 'advanced') {
    return Object.assign(settings, {
      mouseMovements: true,
      typingVariation: true,
      scrollBehavior: true,
      idlePeriods: true,
      readingTime: true
    });
  }

  return settings;
}
/**
 * Report whether stealth mode is switched on and at least one of user-agent
 * randomization, webdriver hiding, or human-behavior simulation is active.
 *
 * @returns {boolean} Truthy when stealth is enabled with at least one feature.
 */
export function isStealthConfigured() {
  const { enabled, fingerprinting, humanBehavior } = config.stealth;

  if (!enabled) {
    return enabled;
  }

  return (
    fingerprinting.randomizeUserAgent ||
    fingerprinting.hideWebDriver ||
    humanBehavior.enabled
  );
}
/**
 * Get the localization section of the configuration.
 *
 * @returns {object} The `config.localization` object itself (not a copy).
 */
export function getLocalizationConfig() {
  const { localization } = config;
  return localization;
}
/**
 * Report whether localization is switched on and at least one of proxying,
 * translation, or automatic geo-blocking bypass is active.
 *
 * @returns {boolean} Truthy when localization is enabled with at least one feature.
 */
export function isLocalizationConfigured() {
  const { enabled, proxy, translation, geoBlocking } = config.localization;

  if (!enabled) {
    return enabled;
  }

  return proxy.enabled || translation.enabled || geoBlocking.autoBypass;
}
/**
 * Get the proxy settings used for localization.
 *
 * @returns {object} The `config.localization.proxy` object itself (not a copy).
 */
export function getProxyConfig() {
  const { proxy } = config.localization;
  return proxy;
}
/**
 * Get the translation settings.
 *
 * @returns {object} The `config.localization.translation` object itself (not a copy).
 */
export function getTranslationConfig() {
  const { translation } = config.localization;
  return translation;
}
/**
 * Get the geo-blocking bypass settings.
 *
 * @returns {object} The `config.localization.geoBlocking` object itself (not a copy).
 */
export function getGeoBlockingConfig() {
  const { geoBlocking } = config.localization;
  return geoBlocking;
}
/**
 * Get the cultural browsing simulation settings.
 *
 * @returns {object} The `config.localization.cultural` object itself (not a copy).
 */
export function getCulturalConfig() {
  const { cultural } = config.localization;
  return cultural;
}
/**
 * Validate the localization section of the configuration.
 *
 * Nothing is checked when localization is disabled. Otherwise the country and
 * language codes are always checked, and the proxy, translation and
 * geo-blocking sub-sections are checked only when their own switch is on.
 *
 * The country code must be exactly two ASCII letters (a length-only check
 * would accept values like "12"). Numeric settings that are NaN are reported
 * explicitly, since NaN passes every `<` / `>` comparison.
 *
 * @returns {string[]} Human-readable error messages; empty when valid.
 */
export function validateLocalizationConfig() {
  const errors = [];
  const localizationConfig = config.localization;

  if (!localizationConfig.enabled) {
    return errors;
  }

  // Records an error and returns false when `value` is NaN.
  const isUsableNumber = (value, envName) => {
    if (Number.isNaN(value)) {
      errors.push(`${envName} must be a valid number`);
      return false;
    }
    return true;
  };

  // Validate country code
  const { defaultCountry, defaultLanguage } = localizationConfig;
  if (typeof defaultCountry !== 'string' || !/^[A-Za-z]{2}$/.test(defaultCountry)) {
    errors.push('DEFAULT_COUNTRY_CODE must be a valid 2-letter country code');
  }

  // Validate language code
  if (typeof defaultLanguage !== 'string' || !defaultLanguage.includes('-')) {
    errors.push('DEFAULT_LANGUAGE must be in format language-country (e.g., en-US)');
  }

  // Validate proxy configuration
  const { proxy, translation, geoBlocking } = localizationConfig;
  if (proxy.enabled) {
    const rotationInterval = proxy.rotation.interval;
    if (isUsableNumber(rotationInterval, 'PROXY_ROTATION_INTERVAL') && rotationInterval < 60000) {
      errors.push('PROXY_ROTATION_INTERVAL should be at least 60000ms (1 minute)');
    }
    const healthInterval = proxy.healthCheck.interval;
    if (isUsableNumber(healthInterval, 'PROXY_HEALTH_CHECK_INTERVAL') && healthInterval < 60000) {
      errors.push('PROXY_HEALTH_CHECK_INTERVAL should be at least 60000ms (1 minute)');
    }
  }

  // Validate translation configuration
  if (translation.enabled) {
    const validProviders = ['google', 'azure', 'libre'];
    if (!validProviders.includes(translation.defaultProvider)) {
      errors.push(`TRANSLATION_PROVIDER must be one of: ${validProviders.join(', ')}`);
    }
  }

  // Validate geo-blocking configuration
  if (geoBlocking.autoBypass) {
    if (isUsableNumber(geoBlocking.maxRetries, 'GEO_BLOCKING_MAX_RETRIES') && geoBlocking.maxRetries > 10) {
      errors.push('GEO_BLOCKING_MAX_RETRIES should not exceed 10');
    }
    if (isUsableNumber(geoBlocking.retryDelay, 'GEO_BLOCKING_RETRY_DELAY') && geoBlocking.retryDelay < 1000) {
      errors.push('GEO_BLOCKING_RETRY_DELAY should be at least 1000ms');
    }
  }

  return errors;
}
// Default export: the same `config` object that is also exported by name.
export default config;