/**
* MCP Basic Web Crawler Server - Consolidated Single File
*
* A Model Context Protocol server providing basic web crawling and search capabilities.
* This consolidated version contains all functionality in a single file for easy deployment.
*/
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { CallToolRequestSchema, ListToolsRequestSchema, ErrorCode, McpError, } from '@modelcontextprotocol/sdk/types.js';
import { zodToJsonSchema } from 'zod-to-json-schema';
import { z } from 'zod';
import axios from 'axios';
import * as cheerio from 'cheerio';
import * as os from 'os';
import * as fs from 'fs/promises';
import { unlinkSync } from 'fs';
import * as path from 'path';
export const DEFAULT_CONFIG = {
searchRateLimit: 30,
fetchRateLimit: 20,
maxInMemorySize: 5 * 1024 * 1024, // 5MB
requestTimeout: 30000,
maxRedirects: 5,
maxContentLength: 8000,
userAgent: 'Mozilla/5.0 (compatible; MCP-WebCrawler/1.0; +https://github.com/calmren/mcp-basic-web-crawler)',
respectRobots: true,
batchDelay: 500,
};
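// Illustrative: every default above can be overridden via CLI flags or
// environment variables (see parseArgs/printHelp below), e.g.:
//   mcp-web-crawler --search-rate-limit 20 --log-level debug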
// ============================================================================
// VALIDATION SCHEMAS
// ============================================================================
export const DuckDuckGoWebSearchArgsSchema = z.object({
query: z.string().describe('Search query string'),
maxResults: z
.number()
.optional()
.default(10)
.describe('Maximum number of results to return (default: 10)'),
});
export const UrlContentExtractorArgsSchema = z.object({
url: z.union([
z.string().describe('The webpage URL to fetch content from'),
z.array(z.string()).describe('List of webpage URLs to get content from'),
]),
});
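// Illustrative sketch: validating tool arguments with these schemas.
// zod's `safeParse` returns a result object instead of throwing on bad input:
//   const parsed = DuckDuckGoWebSearchArgsSchema.safeParse({ query: 'mcp servers' });
//   if (parsed.success) console.log(parsed.data.maxResults); // 10 (default applied)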
// ============================================================================
// LOGGER UTILITY
// ============================================================================
export var LogLevel;
(function (LogLevel) {
LogLevel[LogLevel["ERROR"] = 0] = "ERROR";
LogLevel[LogLevel["WARN"] = 1] = "WARN";
LogLevel[LogLevel["INFO"] = 2] = "INFO";
LogLevel[LogLevel["DEBUG"] = 3] = "DEBUG";
})(LogLevel || (LogLevel = {}));
export class Logger {
level;
prefix;
constructor(prefix = 'MCP-WebCrawler', level = LogLevel.INFO) {
this.prefix = prefix;
this.level = level;
}
log(level, message, ...args) {
if (level <= this.level) {
const timestamp = new Date().toISOString();
const levelName = LogLevel[level];
// Log to stderr: stdout carries the MCP JSON-RPC stream over stdio,
// so writing logs there would corrupt the protocol
console.error(`[${timestamp}] [${this.prefix}] [${levelName}] ${message}`, ...args);
}
}
error(message, ...args) {
this.log(LogLevel.ERROR, message, ...args);
}
warn(message, ...args) {
this.log(LogLevel.WARN, message, ...args);
}
info(message, ...args) {
this.log(LogLevel.INFO, message, ...args);
}
debug(message, ...args) {
this.log(LogLevel.DEBUG, message, ...args);
}
setLevel(level) {
this.level = level;
}
child(suffix) {
return new Logger(`${this.prefix}:${suffix}`, this.level);
}
}
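// Illustrative usage sketch for the logger:
//   const log = new Logger('MCP-WebCrawler', LogLevel.DEBUG);
//   const searchLog = log.child('DuckDuckGo'); // prefix becomes MCP-WebCrawler:DuckDuckGo
//   searchLog.debug('query received'); // emitted only when the level is DEBUG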
// ============================================================================
// RATE LIMITER UTILITY
// ============================================================================
export class RateLimiter {
requestsPerMinute;
requests;
constructor(requestsPerMinute = 30) {
this.requestsPerMinute = requestsPerMinute;
this.requests = [];
}
/**
* Acquire permission to make a request, waiting if necessary
*/
async acquire() {
const now = Date.now();
// Remove requests older than 1 minute
this.requests = this.requests.filter(req => now - req.getTime() < 60 * 1000);
if (this.requests.length >= this.requestsPerMinute) {
// Wait until the oldest request falls outside the 1-minute window
const oldestRequest = this.requests[0];
const waitTime = 60 * 1000 - (now - oldestRequest.getTime());
if (waitTime > 0) {
await new Promise(resolve => setTimeout(resolve, waitTime));
}
}
// Record the actual request time (after any wait), not the stale
// timestamp captured before waiting
this.requests.push(new Date());
}
/**
* Get current rate limit status
*/
getStatus() {
const now = new Date();
// Clean up old requests
this.requests = this.requests.filter(req => now.getTime() - req.getTime() < 60 * 1000);
const resetTime = this.requests.length > 0 ? new Date(this.requests[0].getTime() + 60 * 1000) : null;
return {
current: this.requests.length,
limit: this.requestsPerMinute,
resetTime,
};
}
/**
* Update the rate limit
*/
updateLimit(newLimit) {
this.requestsPerMinute = newLimit;
}
}
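// Illustrative usage sketch: a sliding-window limiter shared by all callers.
//   const limiter = new RateLimiter(2); // 2 requests per minute
//   await limiter.acquire(); // immediate
//   await limiter.acquire(); // immediate
//   await limiter.acquire(); // blocks until the oldest request ages past 60s
//   limiter.getStatus(); // { current, limit, resetTime }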
// ============================================================================
// ARGUMENT PARSING
// ============================================================================
function parseArgs() {
const args = process.argv.slice(2);
const config = { ...DEFAULT_CONFIG };
let logLevel = LogLevel.INFO;
for (let i = 0; i < args.length; i++) {
const arg = args[i];
const nextArg = args[i + 1];
switch (arg) {
case '--search-rate-limit':
if (nextArg && !isNaN(Number(nextArg))) {
config.searchRateLimit = Number(nextArg);
i++;
}
break;
case '--fetch-rate-limit':
if (nextArg && !isNaN(Number(nextArg))) {
config.fetchRateLimit = Number(nextArg);
i++;
}
break;
case '--max-content-length':
if (nextArg && !isNaN(Number(nextArg))) {
config.maxContentLength = Number(nextArg);
i++;
}
break;
case '--timeout':
if (nextArg && !isNaN(Number(nextArg))) {
config.requestTimeout = Number(nextArg);
i++;
}
break;
case '--user-agent':
if (nextArg) {
config.userAgent = nextArg;
i++;
}
break;
case '--log-level':
if (nextArg) {
switch (nextArg.toLowerCase()) {
case 'error':
logLevel = LogLevel.ERROR;
break;
case 'warn':
logLevel = LogLevel.WARN;
break;
case 'info':
logLevel = LogLevel.INFO;
break;
case 'debug':
logLevel = LogLevel.DEBUG;
break;
}
i++;
}
break;
case '--help':
case '-h':
printHelp();
process.exit(0);
break;
}
}
return { config, logLevel };
}
function printHelp() {
console.log(`
MCP Web Crawler Server

A Model Context Protocol server providing basic web crawling and search capabilities.

Usage: mcp-web-crawler [options]

Options:
  --search-rate-limit <number>   Maximum search requests per minute (default: 30)
  --fetch-rate-limit <number>    Maximum fetch requests per minute (default: 20)
  --max-content-length <number>  Maximum content length to return (default: 8000)
  --timeout <number>             Request timeout in milliseconds (default: 30000)
  --user-agent <string>          Custom user agent string
  --log-level <level>            Log level: error, warn, info, debug (default: info)
  --help, -h                     Show this help message

Examples:
  mcp-web-crawler
  mcp-web-crawler --search-rate-limit 20 --log-level debug
  mcp-web-crawler --timeout 60000 --max-content-length 10000

Environment Variables:
  MCP_WEB_CRAWLER_LOG_LEVEL   Set log level (error, warn, info, debug)
  MCP_WEB_CRAWLER_USER_AGENT  Set custom user agent

For more information, visit: https://github.com/calmren/mcp-basic-web-crawler
`);
}
// ============================================================================
// DUCKDUCKGO SEARCHER
// ============================================================================
export class DuckDuckGoSearcher {
static BASE_URL = 'https://html.duckduckgo.com/html';
rateLimiter;
logger;
config;
constructor(config, logger) {
this.config = config;
this.rateLimiter = new RateLimiter(config.searchRateLimit);
this.logger = logger.child('DuckDuckGo');
}
/**
* Format search results for LLM consumption
*/
formatResultsForLLM(results) {
if (!results.length) {
return 'No results were found for your search query. Please try rephrasing your search or try again in a few minutes.';
}
const output = [];
output.push(`Found ${results.length} search results:\n`);
for (const result of results) {
output.push(`${result.position}. ${result.title}`);
output.push(` URL: ${result.link}`);
output.push(` Summary: ${result.snippet}`);
output.push(''); // Empty line between results
}
return output.join('\n');
}
/**
* Perform a search query with rate limiting and error handling
*/
async search(query, ctx, maxResults = 10) {
this.logger.info(`Starting search for query: "${query}" (max results: ${maxResults})`);
try {
// Apply rate limiting
await this.rateLimiter.acquire();
this.logger.debug('Rate limit acquired, making request');
// Create form data for POST request
const data = {
q: query,
b: '',
kl: '',
};
const response = await axios.post(DuckDuckGoSearcher.BASE_URL, new URLSearchParams(data), {
headers: {
'User-Agent': this.config.userAgent,
},
timeout: this.config.requestTimeout,
});
this.logger.debug(`Received response with status: ${response.status}`);
// Parse HTML response (cheerio.load throws on failure rather than
// returning null, so no falsy check is needed here)
const $ = cheerio.load(response.data);
const results = [];
$('.result').each((_, element) => {
const titleElem = $(element).find('.result__title');
if (!titleElem.length)
return;
const linkElem = titleElem.find('a');
if (!linkElem.length)
return;
const title = linkElem.text().trim();
let link = linkElem.attr('href') || '';
// Skip ad results
if (link.includes('y.js')) {
this.logger.debug('Skipping ad result');
return;
}
// Clean up DuckDuckGo redirect URLs
if (link.startsWith('//duckduckgo.com/l/?uddg=')) {
link = decodeURIComponent(link.split('uddg=')[1].split('&')[0]);
}
const snippetElem = $(element).find('.result__snippet');
const snippet = snippetElem.length ? snippetElem.text().trim() : '';
results.push({
title,
link,
snippet,
position: results.length + 1,
});
if (results.length >= maxResults) {
return false; // Break out of the loop
}
});
this.logger.info(`Search completed successfully, found ${results.length} results`);
return results;
}
catch (error) {
if (axios.isAxiosError(error) && error.code === 'ECONNABORTED') {
const errorMsg = 'Search request timed out';
this.logger.error(errorMsg);
await ctx.error(errorMsg);
}
else if (axios.isAxiosError(error)) {
const errorMsg = `HTTP error occurred: ${error.message}`;
this.logger.error(errorMsg);
await ctx.error(errorMsg);
}
else {
const errorMsg = `Unexpected error during search: ${error.message}`;
this.logger.error(errorMsg, error);
await ctx.error(errorMsg);
}
return [];
}
}
}
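// Illustrative sketch: using the searcher directly, outside the MCP server.
// The `ctx` argument only needs an async error(message) method:
//   const searcher = new DuckDuckGoSearcher(DEFAULT_CONFIG, new Logger());
//   const ctx = { error: async (msg) => console.error(msg) };
//   const results = await searcher.search('model context protocol', ctx, 5);
//   console.log(searcher.formatResultsForLLM(results));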
// ============================================================================
// WEB CONTENT FETCHER
// ============================================================================
export class WebContentFetcher {
rateLimiter;
tempFiles = [];
logger;
config;
constructor(config, logger) {
this.config = config;
this.rateLimiter = new RateLimiter(config.fetchRateLimit);
this.logger = logger.child('ContentFetcher');
// Set up cleanup on process exit; 'exit' handlers must be synchronous,
// so the async cleanup() cannot run there
process.on('exit', () => this.cleanupSync());
process.on('SIGINT', () => {
this.cleanupSync();
process.exit();
});
}
// Synchronous variant for 'exit' handlers, where async work never completes
cleanupSync() {
for (const file of this.tempFiles) {
try {
unlinkSync(file);
}
catch {
// Ignore errors during cleanup
}
}
}
async cleanup() {
this.logger.debug(`Cleaning up ${this.tempFiles.length} temporary files`);
for (const file of this.tempFiles) {
try {
await fs.unlink(file);
}
catch {
// Ignore errors during cleanup
this.logger.debug(`Failed to clean up temp file: ${file}`);
}
}
}
async getMemoryStats() {
const totalMemory = os.totalmem();
const freeMemory = os.freemem();
const usedMemory = totalMemory - freeMemory;
const usagePercentage = (usedMemory / totalMemory) * 100;
return {
totalMemory,
freeMemory,
usedMemory,
usagePercentage,
};
}
async processHtml(html) {
// Process in memory, or spill to a temp file when the document is large
// or system memory is under pressure
const memoryStats = await this.getMemoryStats();
this.logger.debug(`Memory usage: ${memoryStats.usagePercentage.toFixed(1)}%`);
if (html.length > this.config.maxInMemorySize || memoryStats.usagePercentage > 70) {
this.logger.debug('Spilling to temp file due to size/memory constraints');
// Write to a temporary file; note the file is still read back in full
// before parsing, so this only lets the original response buffer be released
const tempFilePath = path.join(os.tmpdir(), `mcp-fetch-${Date.now()}.html`);
this.tempFiles.push(tempFilePath);
await fs.writeFile(tempFilePath, html);
const fileData = await fs.readFile(tempFilePath, 'utf-8');
const text = this.extractText(fileData);
// Remove the temp file now that processing is done
try {
await fs.unlink(tempFilePath);
const index = this.tempFiles.indexOf(tempFilePath);
if (index > -1) {
this.tempFiles.splice(index, 1);
}
}
catch {
// File will be cleaned up on exit
this.logger.debug(`Failed to immediately clean temp file: ${tempFilePath}`);
}
return text;
}
this.logger.debug('Using in-memory processing');
return this.extractText(html);
}
/**
* Strip scripts, styles, and navigation chrome, collapse whitespace, and
* truncate to the configured maximum content length
*/
extractText(html) {
const $ = cheerio.load(html);
// Remove non-content elements
$('script, style, nav, header, footer').remove();
// Collapse whitespace in the remaining text
let text = $.text().replace(/\s+/g, ' ').trim();
// Truncate if too long
if (text.length > this.config.maxContentLength) {
text = text.substring(0, this.config.maxContentLength) + '... [content truncated]';
}
return text;
}
async fetchAndParse(urlStr, ctx) {
this.logger.info(`Fetching content from: ${urlStr}`);
try {
await this.rateLimiter.acquire();
this.logger.debug('Rate limit acquired, making request');
const response = await axios.get(urlStr, {
headers: {
'User-Agent': this.config.userAgent,
},
maxRedirects: this.config.maxRedirects,
timeout: this.config.requestTimeout,
responseType: 'text',
});
this.logger.debug(`Received response with status: ${response.status}, content length: ${response.data.length}`);
const text = await this.processHtml(response.data);
this.logger.info(`Successfully processed content from: ${urlStr}`);
return text;
}
catch (error) {
if (axios.isAxiosError(error) && error.code === 'ECONNABORTED') {
const errorMsg = `Request timed out for URL: ${urlStr}`;
this.logger.error(errorMsg);
await ctx.error(errorMsg);
return `Error: The request timed out while trying to fetch the webpage.`;
}
else if (axios.isAxiosError(error)) {
const errorMsg = `HTTP error occurred while fetching ${urlStr}: ${error.message}`;
this.logger.error(errorMsg);
await ctx.error(errorMsg);
return `Error: Could not access the webpage (${error.message})`;
}
else {
const errorMsg = `Error fetching content from ${urlStr}: ${error.message}`;
this.logger.error(errorMsg, error);
await ctx.error(errorMsg);
return `Error: An unexpected error occurred while fetching the webpage (${error.message})`;
}
}
}
async fetchMultipleUrls(urls, ctx) {
this.logger.info(`Fetching content from ${urls.length} URLs`);
const results = {};
const memoryStats = await this.getMemoryStats();
// Determine batch size based on available memory
let batchSize = 3; // Default
if (memoryStats.usagePercentage > 70) {
batchSize = 1; // Reduce batch size if memory is constrained
this.logger.debug('Reduced batch size due to high memory usage');
}
else if (memoryStats.usagePercentage < 30) {
batchSize = 5; // Increase batch size if plenty of memory
this.logger.debug('Increased batch size due to low memory usage');
}
this.logger.debug(`Processing URLs in batches of ${batchSize}`);
// Process URLs in batches to manage memory
for (let i = 0; i < urls.length; i += batchSize) {
const batch = urls.slice(i, i + batchSize);
this.logger.debug(`Processing batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(urls.length / batchSize)}`);
// Process batch in parallel
const batchResults = await Promise.all(batch.map(async (url) => {
try {
const content = await this.fetchAndParse(url, ctx);
return { url, content };
}
catch (error) {
// Handle errors for individual URLs
const errorMsg = `Error processing URL: ${error.message}`;
this.logger.error(errorMsg, { url });
return {
url,
content: errorMsg,
};
}
}));
// Add batch results to the overall results
for (const { url, content } of batchResults) {
results[url] = content;
}
// Force garbage collection if available (Node with --expose-gc flag)
if (global.gc) {
global.gc();
this.logger.debug('Forced garbage collection');
}
// Small delay between batches to allow system to recover
if (i + batchSize < urls.length) {
await new Promise(resolve => setTimeout(resolve, this.config.batchDelay));
}
}
this.logger.info(`Successfully processed ${Object.keys(results).length} URLs`);
return results;
}
}
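// Illustrative sketch: batch-fetching pages directly.
//   const fetcher = new WebContentFetcher(DEFAULT_CONFIG, new Logger());
//   const ctx = { error: async (msg) => console.error(msg) };
//   const pages = await fetcher.fetchMultipleUrls(
//     ['https://example.com', 'https://example.org'], ctx);
//   // `pages` maps each URL to its extracted text (or an error string)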
// ============================================================================
// MCP WEB CRAWLER SERVER
// ============================================================================
export class WebCrawlerServer {
server;
logger;
searcher;
fetcher;
constructor(config, logger) {
this.logger = logger || new Logger('WebCrawlerServer');
// Initialize crawlers
this.searcher = new DuckDuckGoSearcher(config, this.logger);
this.fetcher = new WebContentFetcher(config, this.logger);
// Create server instance
this.server = new Server({
name: 'mcp-web-crawler',
version: '1.0.0',
}, {
capabilities: {
tools: {},
},
});
this.logger.info('WebCrawlerServer initialized');
this.setupRequestHandlers();
}
/**
* Start the server with the provided transport
*/
async startServer(transport) {
this.logger.info('Starting MCP Web Crawler Server');
await this.server.connect(transport);
}
/**
* Set up request handlers for the MCP server
*/
setupRequestHandlers() {
// Set tool list handler
this.server.setRequestHandler(ListToolsRequestSchema, async () => {
this.logger.debug('Received ListTools request');
return {
tools: [
{
name: 'web_search',
description: 'Search the web using DuckDuckGo search engine. Returns a structured list of search results with titles, URLs, and snippets. ' +
'Respects rate limits and follows ethical crawling practices. ' +
'Use this tool to find information, research topics, or discover relevant web content.',
inputSchema: zodToJsonSchema(DuckDuckGoWebSearchArgsSchema),
},
{
name: 'fetch_content',
description: 'Fetch and extract clean text content from one or more web pages. Removes scripts, styles, and navigation elements to provide clean, readable content. ' +
'Supports both single URLs (string) and multiple URLs (array). Features memory-efficient processing and rate limiting. ' +
'Use this tool to extract article content, documentation, or any text-based web content.',
inputSchema: zodToJsonSchema(UrlContentExtractorArgsSchema),
},
],
};
});
// Set tool call handler
this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
const { name, arguments: args } = request.params;
this.logger.info(`Received tool call: ${name}`);
try {
// Create context adapter for error reporting
const contextAdapter = {
error: async (message) => {
this.logger.error(`Context error: ${message}`);
},
};
switch (name) {
case 'web_search': {
this.logger.debug('Processing web_search request');
const validatedArgs = DuckDuckGoWebSearchArgsSchema.parse(args);
const results = await this.searcher.search(validatedArgs.query, contextAdapter, validatedArgs.maxResults);
const formattedResults = this.searcher.formatResultsForLLM(results);
return {
content: [{ type: 'text', text: formattedResults }],
isError: false,
};
}
case 'fetch_content': {
this.logger.debug('Processing fetch_content request');
const validatedArgs = UrlContentExtractorArgsSchema.parse(args);
if (typeof validatedArgs.url === 'string') {
this.logger.debug(`Fetching content from single URL: ${validatedArgs.url}`);
const result = await this.fetcher.fetchAndParse(validatedArgs.url, contextAdapter);
return {
content: [{ type: 'text', text: result }],
isError: false,
};
}
else if (Array.isArray(validatedArgs.url)) {
this.logger.debug(`Fetching content from ${validatedArgs.url.length} URLs`);
const results = await this.fetcher.fetchMultipleUrls(validatedArgs.url, contextAdapter);
return {
content: [{ type: 'text', text: JSON.stringify(results, null, 2) }],
isError: false,
};
}
else {
throw new McpError(ErrorCode.InvalidParams, 'Invalid URL format. Expected string or array of strings.');
}
}
default:
this.logger.warn(`Unknown tool requested: ${name}`);
return {
content: [{ type: 'text', text: `Unknown tool: ${name}` }],
isError: true,
};
}
}
catch (error) {
this.logger.error(`Tool execution error: ${error}`, error);
return {
content: [
{
type: 'text',
text: `Error: ${error instanceof Error ? error.message : String(error)}`,
},
],
isError: true,
};
}
});
this.logger.debug('Request handlers configured');
}
}
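// Illustrative sketch of the tool-call payloads an MCP client sends,
// matching the schemas defined above:
//   { "name": "web_search", "arguments": { "query": "mcp servers", "maxResults": 5 } }
//   { "name": "fetch_content", "arguments": { "url": "https://example.com" } }
//   { "name": "fetch_content", "arguments": { "url": ["https://a.example", "https://b.example"] } }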
async function main() {
try {
// Parse command line arguments
const { config, logLevel } = parseArgs();
// Override log level with environment variable if present
let finalLogLevel = logLevel;
if (process.env.MCP_WEB_CRAWLER_LOG_LEVEL) {
const envLogLevel = process.env.MCP_WEB_CRAWLER_LOG_LEVEL.toLowerCase();
switch (envLogLevel) {
case 'error':
finalLogLevel = LogLevel.ERROR;
break;
case 'warn':
finalLogLevel = LogLevel.WARN;
break;
case 'info':
finalLogLevel = LogLevel.INFO;
break;
case 'debug':
finalLogLevel = LogLevel.DEBUG;
break;
}
}
if (process.env.MCP_WEB_CRAWLER_USER_AGENT) {
config.userAgent = process.env.MCP_WEB_CRAWLER_USER_AGENT;
}
// Initialize logger
const logger = new Logger('MCP-WebCrawler', finalLogLevel);
logger.info('Starting MCP Web Crawler Server');
logger.debug('Configuration:', config);
// Create and start server
const server = new WebCrawlerServer(config, logger);
const transport = new StdioServerTransport();
await server.startServer(transport);
logger.info('MCP Web Crawler Server started successfully');
}
catch (error) {
console.error('Failed to start MCP Web Crawler Server:', error);
process.exit(1);
}
}
// Handle graceful shutdown
process.on('SIGINT', () => {
// Write to stderr; stdout is reserved for the MCP protocol stream
console.error('\nReceived SIGINT, shutting down gracefully...');
process.exit(0);
});
process.on('SIGTERM', () => {
console.error('\nReceived SIGTERM, shutting down gracefully...');
process.exit(0);
});
// Start the server
main().catch(error => {
console.error('Unhandled error:', error);
process.exit(1);
});