mcp-basic-web-crawler
Version:
A Model Context Protocol (MCP) server providing ethical web crawling and search capabilities
150 lines • 4.59 kB
TypeScript
#!/usr/bin/env node
/**
* MCP Basic Web Crawler Server - Consolidated Single File
*
* A Model Context Protocol server providing basic web crawling and search capabilities.
* This consolidated version contains all functionality in a single file for easy deployment.
*/
import { Transport } from '@modelcontextprotocol/sdk/shared/transport';
import { z } from 'zod';
/**
* Per-request context handed to the search and fetch operations declared in
* this file (`search`, `fetchAndParse`, `fetchMultipleUrls` all take a
* `ctx: Context`).
*
* Only one capability is declared: an asynchronous `error` method.
*/
export interface Context {
/**
* Asynchronously reports an error message.
* NOTE(review): presumably this forwards the message to the MCP client —
* the destination is not visible in this declaration; confirm against the
* implementation.
*
* @param message - Human-readable error text.
* @returns A promise that settles when reporting has finished.
*/
error(message: string): Promise<void>;
}
/**
* A single search hit, as returned in the array resolved by
* `DuckDuckGoSearcher.search` and accepted by
* `DuckDuckGoSearcher.formatResultsForLLM`.
*/
export interface SearchResult {
/** Title of the result. */
title: string;
/** Link associated with the result. */
link: string;
/** Short text excerpt for the result. */
snippet: string;
/**
* Numeric position of the result.
* NOTE(review): presumably the rank within the result list; whether it is
* 0- or 1-based is not visible here — confirm.
*/
position: number;
}
/**
* Snapshot of memory figures.
*
* NOTE(review): units (presumably bytes) and the scale of `usagePercentage`
* (0-1 vs 0-100) are not visible in this declaration — confirm against the
* code that produces these values (`WebContentFetcher` declares a private
* `getMemoryStats` member, which is the likely producer).
*/
export interface MemoryStats {
/** Total memory figure. */
totalMemory: number;
/** Free memory figure. */
freeMemory: number;
/** Used memory figure. */
usedMemory: number;
/** Memory usage expressed as a percentage (scale unconfirmed, see above). */
usagePercentage: number;
}
/**
* Configuration consumed by the crawler classes declared in this file
* (`DuckDuckGoSearcher`, `WebContentFetcher` and `WebCrawlerServer` all take a
* `CrawlerConfig` in their constructors).
*
* Every field is required; `DEFAULT_CONFIG` is declared as a ready-made value
* of this type.
*/
export interface CrawlerConfig {
/** Maximum requests per minute for search operations */
searchRateLimit: number;
/** Maximum requests per minute for content fetching */
fetchRateLimit: number;
/** Maximum content size to process in memory (bytes) */
maxInMemorySize: number;
/** Request timeout in milliseconds */
requestTimeout: number;
/** Maximum number of redirects to follow */
maxRedirects: number;
/** Maximum content length to return (characters) */
maxContentLength: number;
/** User agent string for requests */
userAgent: string;
/** Whether to respect robots.txt */
respectRobots: boolean;
/** Delay between batch processing (milliseconds) */
batchDelay: number;
}
/**
* Outcome of evaluating robots.txt rules.
*
* NOTE(review): no declaration in this file takes or returns this type, so how
* it is produced and consumed cannot be confirmed from here.
*/
export interface RobotsTxtRules {
/** Whether access is allowed. */
allowed: boolean;
/**
* Optional crawl delay.
* NOTE(review): unit is not visible here (robots.txt `Crawl-delay` is
* conventionally seconds) — confirm before mixing with the millisecond
* values in `CrawlerConfig`.
*/
crawlDelay?: number;
}
/**
* Default crawler configuration.
*
* Only the type is declared here; the concrete values live in the compiled
* implementation and are not visible in this declaration file.
*/
export declare const DEFAULT_CONFIG: CrawlerConfig;
/**
* Zod object schema for DuckDuckGo web-search arguments.
*
* - `query`: required string.
* - `maxResults`: number that may be omitted on input; because the field is a
*   `ZodDefault`, the parsed output always carries a `number`. The default
*   value itself is not visible in this declaration.
*
* The object's unknown-keys mode is `"strip"`.
*/
export declare const DuckDuckGoWebSearchArgsSchema: z.ZodObject<{
query: z.ZodString;
maxResults: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
}, "strip", z.ZodTypeAny, {
// Output (parsed) shape: `maxResults` is always present.
query: string;
maxResults: number;
}, {
// Input (pre-parse) shape: `maxResults` may be omitted or undefined.
query: string;
maxResults?: number | undefined;
}>;
/**
* Zod object schema for URL content-extraction arguments.
*
* - `url`: either a single string or an array of strings (a `ZodUnion` of
*   `ZodString` and `ZodArray<ZodString>`).
*
* The object's unknown-keys mode is `"strip"`. Input and output shapes are
* identical. NOTE(review): the declared element type is plain `ZodString`, so
* any URL-format validation is not visible here — confirm in the
* implementation.
*/
export declare const UrlContentExtractorArgsSchema: z.ZodObject<{
url: z.ZodUnion<[z.ZodString, z.ZodArray<z.ZodString, "many">]>;
}, "strip", z.ZodTypeAny, {
// Output (parsed) shape.
url: string | string[];
}, {
// Input (pre-parse) shape.
url: string | string[];
}>;
/**
* Parsed (output) type of `DuckDuckGoWebSearchArgsSchema`:
* `{ query: string; maxResults: number }`.
*/
export type DuckDuckGoWebSearchArgs = z.infer<typeof DuckDuckGoWebSearchArgsSchema>;
/**
* Parsed (output) type of `UrlContentExtractorArgsSchema`:
* `{ url: string | string[] }`.
*/
export type UrlContentExtractorArgs = z.infer<typeof UrlContentExtractorArgsSchema>;
/**
* Log levels accepted by `Logger` (constructor and `setLevel`).
*
* This is a numeric enum running from `ERROR = 0` to `DEBUG = 3`.
* NOTE(review): presumably `Logger` emits a message only when its level is
* less than or equal to the configured level (so a higher setting is more
* verbose) — the comparison is not visible in this declaration; confirm.
*/
export declare enum LogLevel {
ERROR = 0,
WARN = 1,
INFO = 2,
DEBUG = 3
}
/**
* Levelled logger with one method per `LogLevel` member.
*
* Both constructor arguments are optional; their default values are not
* visible in this declaration.
*
* NOTE(review): the `...args: any[]` rest parameters mirror the compiled
* implementation. If the source is ever tightened to `unknown[]`, regenerate
* this file rather than hand-editing it.
*/
export declare class Logger {
// Private state; types are elided by the declaration emitter.
private level;
private prefix;
/**
* @param prefix - Optional prefix for this logger.
* @param level - Optional initial log level.
*/
constructor(prefix?: string, level?: LogLevel);
// Private helper; signature not visible here.
private log;
/** Logs `message` (plus any extra arguments) at error level. */
error(message: string, ...args: any[]): void;
/** Logs `message` (plus any extra arguments) at warn level. */
warn(message: string, ...args: any[]): void;
/** Logs `message` (plus any extra arguments) at info level. */
info(message: string, ...args: any[]): void;
/** Logs `message` (plus any extra arguments) at debug level. */
debug(message: string, ...args: any[]): void;
/**
* Sets this logger's level.
* @param level - The new level.
*/
setLevel(level: LogLevel): void;
/**
* Creates another `Logger` from this one.
* NOTE(review): presumably the child's prefix combines this logger's prefix
* with `suffix` and inherits the level — not visible here; confirm.
*
* @param suffix - Suffix identifying the child logger.
* @returns A new `Logger` instance.
*/
child(suffix: string): Logger;
}
/**
* Requests-per-minute rate limiter.
*
* Callers await `acquire()` before each request. The limit can be inspected
* with `getStatus()` and changed with `updateLimit()`.
*/
export declare class RateLimiter {
// Private state; types are elided by the declaration emitter.
private requestsPerMinute;
private requests;
/**
* @param requestsPerMinute - Optional limit; the default value is not
* visible in this declaration.
*/
constructor(requestsPerMinute?: number);
/**
* Acquire permission to make a request, waiting if necessary
*
* @returns A promise that resolves once the request may proceed.
*/
acquire(): Promise<void>;
/**
* Get current rate limit status
*
* @returns An object with `current` and `limit` counts and a `resetTime`
* that is either a `Date` or `null`.
* NOTE(review): presumably `current` counts requests in the active window
* and `resetTime` is `null` when no requests are tracked — confirm.
*/
getStatus(): {
current: number;
limit: number;
resetTime: Date | null;
};
/**
* Update the rate limit
*
* @param newLimit - The new limit (presumably requests per minute, matching
* the constructor argument — confirm).
*/
updateLimit(newLimit: number): void;
}
/**
* Search client that resolves queries to `SearchResult` lists and can format
* them as text for an LLM.
*
* Constructed from a `CrawlerConfig` and a `Logger`. It holds a private rate
* limiter; which `CrawlerConfig` field feeds it is not visible here
* (presumably `searchRateLimit` — confirm).
*/
export declare class DuckDuckGoSearcher {
// Private members; types/values are elided by the declaration emitter.
private static readonly BASE_URL;
private rateLimiter;
private logger;
private config;
/**
* @param config - Crawler configuration.
* @param logger - Logger used by this searcher.
*/
constructor(config: CrawlerConfig, logger: Logger);
/**
* Format search results for LLM consumption
*
* @param results - Results to format.
* @returns The formatted text.
*/
formatResultsForLLM(results: SearchResult[]): string;
/**
* Perform a search query with rate limiting and error handling
*
* @param query - The search query.
* @param ctx - Request context (exposes `error` for reporting).
* @param maxResults - Optional cap on results; the default is not visible
* in this declaration.
* @returns A promise resolving to the search results.
*/
search(query: string, ctx: Context, maxResults?: number): Promise<SearchResult[]>;
}
/**
* Fetches one or more URLs and returns their content as strings.
*
* Constructed from a `CrawlerConfig` and a `Logger`.
* NOTE(review): the private `tempFiles`, `cleanup` and `getMemoryStats`
* members suggest that large responses are spilled to temporary files based
* on memory usage — the declaration does not show this; confirm.
*/
export declare class WebContentFetcher {
// Private members; types/signatures are elided by the declaration emitter.
private rateLimiter;
private tempFiles;
private logger;
private config;
/**
* @param config - Crawler configuration.
* @param logger - Logger used by this fetcher.
*/
constructor(config: CrawlerConfig, logger: Logger);
private cleanup;
private getMemoryStats;
private processHtml;
/**
* Fetches a single URL and resolves to a string derived from it.
*
* @param urlStr - The URL, as a string.
* @param ctx - Request context (exposes `error` for reporting).
* @returns A promise resolving to the resulting text.
*/
fetchAndParse(urlStr: string, ctx: Context): Promise<string>;
/**
* Fetches several URLs and resolves to a string-to-string record.
* NOTE(review): presumably keyed by URL with the fetched text as the value,
* and presumably paced by `CrawlerConfig.batchDelay` — confirm.
*
* @param urls - The URLs to fetch.
* @param ctx - Request context (exposes `error` for reporting).
* @returns A promise resolving to the record of results.
*/
fetchMultipleUrls(urls: string[], ctx: Context): Promise<Record<string, string>>;
}
/**
* Top-level server object. It privately holds a server instance, a logger, a
* searcher and a fetcher, and is started over an MCP `Transport`.
*/
export declare class WebCrawlerServer {
// Private members; types are elided by the declaration emitter.
private server;
private logger;
private searcher;
private fetcher;
/**
* @param config - Crawler configuration.
* @param logger - Optional logger; what is used when it is omitted is not
* visible in this declaration.
*/
constructor(config: CrawlerConfig, logger?: Logger);
/**
* Start the server with the provided transport
*
* NOTE(review): declared as returning `void`, so callers cannot await
* start-up or observe a connection failure through the return value —
* confirm how the implementation surfaces errors.
*
* @param transport - The MCP transport to serve over.
*/
startServer(transport: Transport): void;
/**
* Set up request handlers for the MCP server
*/
private setupRequestHandlers;
}
//# sourceMappingURL=index.d.ts.map