UNPKG

@aipmanager/search-mcp

Version:

MCP server providing Cursor-like Search tools: read file, list dir, grep, search files, semantic code search, web search

297 lines (296 loc) 14 kB
#!/usr/bin/env node
/**
 * Search MCP server (stdio transport).
 *
 * Registers six tools: readFile, listDirectory, grepSearch, searchFiles,
 * codebaseSearch (Milvus or Pinecone vector search) and webSearch (Google CSE).
 * All file-system access is confined to BASE_DIR, which comes from the
 * `--baseDir` / `--base-dir` CLI argument or defaults to the process cwd.
 */
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { z } from "zod";
import fs from "fs/promises";
import path from "path";
import fg from "fast-glob";
import OpenAI from "openai";
import { MilvusClient, DataType, MetricType } from "@zilliz/milvus2-sdk-node";
import { Pinecone } from "@pinecone-database/pinecone";

// ---------------- Env & Helpers ----------------

// `||` (not `??`) is deliberate here: an empty-string environment variable
// should fall back to the default just like an unset one.
const ENV = {
  VECTOR_PROVIDER: process.env.INDEXER_VECTOR_PROVIDER || process.env.SEARCH_VECTOR_PROVIDER || "milvus",
  // Milvus
  MILVUS_ADDRESS: process.env.MILVUS_ADDRESS || "localhost:19530",
  MILVUS_USERNAME: process.env.MILVUS_USERNAME || undefined,
  MILVUS_PASSWORD: process.env.MILVUS_PASSWORD || undefined,
  MILVUS_COLLECTION: process.env.MILVUS_COLLECTION || "code_chunks",
  MILVUS_VECTOR_FIELD: process.env.MILVUS_VECTOR_FIELD || "vector",
  MILVUS_METRIC: (process.env.MILVUS_METRIC || "IP"),
  // Pinecone
  PINECONE_API_KEY: process.env.PINECONE_API_KEY || undefined,
  PINECONE_INDEX: process.env.PINECONE_INDEX || undefined,
  PINECONE_NAMESPACE: process.env.PINECONE_NAMESPACE || "default",
  // Embedding
  OPENAI_API_KEY: process.env.OPENAI_API_KEY || undefined,
  SILICONFLOW_API_KEY: process.env.SILICONFLOW_API_KEY || undefined,
  EMBED_PROVIDER: process.env.SEARCH_EMBED_PROVIDER || process.env.INDEXER_EMBED_PROVIDER || undefined,
  EMBED_BASE_URL: process.env.SEARCH_EMBED_BASE_URL || process.env.INDEXER_EMBED_BASE_URL || undefined,
  EMBED_MODEL: process.env.SEARCH_EMBED_MODEL || process.env.INDEXER_EMBED_MODEL || "text-embedding-3-small",
  // Web Search (Google CSE)
  GOOGLE_API_KEY: process.env.GOOGLE_API_KEY || undefined,
  GOOGLE_CSE_ID: process.env.GOOGLE_CSE_ID || undefined,
};

/**
 * Extracts the base directory from the command line.
 *
 * Accepts `--baseDir=<dir>`, `--base-dir=<dir>`, `--baseDir <dir>` and
 * `--base-dir <dir>`.
 *
 * @returns {string | undefined} The raw argument value, or undefined when the
 *   flag is absent.
 */
function parseBaseDirFromArgv() {
  const argv = process.argv.slice(2);
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg.startsWith("--baseDir=") || arg.startsWith("--base-dir=")) {
      // Take everything after the FIRST "=" so directories whose name
      // contains "=" are not truncated.
      return arg.slice(arg.indexOf("=") + 1);
    }
    if (arg === "--baseDir" || arg === "--base-dir") return argv[i + 1];
  }
  return undefined;
}

// Absolute, normalized root that every file-system tool is confined to.
const BASE_DIR = (() => {
  const candidate = parseBaseDirFromArgv() || process.cwd();
  return path.isAbsolute(candidate)
    ? path.normalize(candidate)
    : path.normalize(path.join(process.cwd(), candidate));
})();

/**
 * Reports whether an absolute path is BASE_DIR itself or lies beneath it.
 * This is a lexical check only; symbolic links are not resolved.
 *
 * @param {string} absPath - Absolute path to test.
 * @returns {boolean} True when the path does not escape BASE_DIR.
 */
function isWithinBaseDir(absPath) {
  const rel = path.relative(BASE_DIR, absPath);
  // Only a leading ".." path SEGMENT escapes; an entry merely named
  // "..something" is still inside the base directory.
  const climbsOut = rel === ".." || rel.startsWith(`..${path.sep}`);
  // An absolute `rel` means a different root (e.g. another Windows drive).
  return !climbsOut && !path.isAbsolute(rel);
}

/**
 * Throws when the given absolute path is outside BASE_DIR.
 *
 * @param {string} absPath - Absolute path to validate.
 * @throws {Error} When the path escapes BASE_DIR.
 */
function assertWithinBaseDir(absPath) {
  if (!isWithinBaseDir(absPath)) {
    throw new Error(`访问越界:路径不在 baseDir 范围内 (${BASE_DIR})`);
  }
}

/**
 * Resolves a user-supplied path against BASE_DIR and validates containment.
 *
 * @param {string} p - Absolute path, or path relative to BASE_DIR.
 * @returns {string} The normalized absolute path; a falsy input is returned
 *   unchanged.
 * @throws {Error} When the resolved path escapes BASE_DIR.
 */
function toAbsolute(p) {
  if (!p) return p;
  const abs = path.isAbsolute(p) ? path.normalize(p) : path.normalize(path.join(BASE_DIR, p));
  assertWithinBaseDir(abs);
  return abs;
}

/**
 * Reads a UTF-8 text file and returns a window of its lines.
 *
 * @param {string} filePath - File path (absolute or relative to BASE_DIR).
 * @param {number} [offset] - Zero-based first line; clamped to [0, totalLines].
 * @param {number} [limit] - Maximum number of lines; defaults to all remaining.
 * @returns {Promise<{ text: string, totalLines: number }>} The selected lines
 *   joined with "\n", plus the total line count of the file.
 */
async function readTextFileLimited(filePath, offset, limit) {
  const abs = toAbsolute(filePath);
  const content = await fs.readFile(abs, "utf8");
  const lines = content.split(/\r?\n/);
  const start = Math.max(0, Math.min(offset ?? 0, lines.length));
  const end = Math.max(start, Math.min(start + (limit ?? lines.length), lines.length));
  const sliced = lines.slice(start, end).join("\n");
  return { text: sliced, totalLines: lines.length };
}

/**
 * Searches files under `baseDir` line by line with a regular expression.
 *
 * @param {string} baseDir - Absolute directory to search (already validated
 *   by the caller).
 * @param {string} pattern - Regular-expression source. It comes from the tool
 *   caller, so an invalid pattern throws and a pathological one can be slow.
 * @param {object} opts
 * @param {string[]} [opts.include] - Globs to include; defaults to all files.
 * @param {string[]} [opts.exclude] - Globs to ignore; defaults to common
 *   dependency/build directories.
 * @param {boolean} [opts.caseInsensitive] - Adds the `i` flag.
 * @param {boolean} [opts.multiline] - Adds the `m` flag.
 * @param {number} [opts.headLimit] - Max hits, clamped to [1, 2000]; default 200.
 * @param {number} [opts.contextBefore] - Lines of leading context, clamped to [0, 20].
 * @param {number} [opts.contextAfter] - Lines of trailing context, clamped to [0, 20].
 * @returns {Promise<Array<{ file: string, line: number, match: string, before?: string[], after?: string[] }>>}
 *   Hits with `file` relative to BASE_DIR and a 1-based `line`.
 */
async function grepInFiles(baseDir, pattern, opts) {
  const include = opts.include && opts.include.length > 0 ? opts.include : ["**/*"];
  const exclude = opts.exclude || ["**/node_modules/**", "**/.git/**", "**/dist/**", "**/build/**"];
  const files = await fg(include, { cwd: baseDir, ignore: exclude, dot: false, onlyFiles: true, unique: true });
  // No `g`/`y` flag is ever set, so `regex.test` keeps no state between lines.
  const flags = `${opts.caseInsensitive ? "i" : ""}${opts.multiline ? "m" : ""}`;
  const regex = new RegExp(pattern, flags);
  const results = [];
  const headLimit = Math.max(1, Math.min(opts.headLimit ?? 200, 2000));
  const beforeN = Math.max(0, Math.min(opts.contextBefore ?? 0, 20));
  const afterN = Math.max(0, Math.min(opts.contextAfter ?? 0, 20));
  for (const rel of files) {
    const abs = path.join(baseDir, rel);
    // Caller-supplied include globs such as "../**" can match files outside
    // the sandbox; never read those.
    if (!isWithinBaseDir(abs)) continue;
    let text;
    try {
      text = await fs.readFile(abs, "utf8");
    } catch {
      // Unreadable entries (permissions, races with deletion) are skipped on
      // purpose so a single bad file does not abort the whole search.
      continue;
    }
    const lines = text.split(/\r?\n/);
    for (let i = 0; i < lines.length; i++) {
      const lineText = lines[i];
      if (!regex.test(lineText)) continue;
      const before = beforeN > 0 ? lines.slice(Math.max(0, i - beforeN), i) : undefined;
      const after = afterN > 0 ? lines.slice(i + 1, Math.min(lines.length, i + 1 + afterN)) : undefined;
      const relPath = path.relative(BASE_DIR, abs);
      results.push({ file: relPath, line: i + 1, match: lineText, before, after });
      if (results.length >= headLimit) return results;
    }
  }
  return results;
}

/**
 * Lists the entries of a directory inside BASE_DIR.
 *
 * @param {string} dir - Directory path (absolute or relative to BASE_DIR).
 * @returns {Promise<Array<{ name: string, type: "dir" | "file" | "other" }>>}
 */
async function listDirSafe(dir) {
  const abs = toAbsolute(dir);
  const entries = await fs.readdir(abs, { withFileTypes: true });
  return entries.map((e) => ({
    name: e.name,
    type: e.isDirectory() ? "dir" : e.isFile() ? "file" : "other",
  }));
}

/**
 * Finds files whose relative path contains `query` (case-insensitive
 * substring match).
 *
 * @param {string} baseDir - Absolute directory to search (already validated
 *   by the caller).
 * @param {string} query - Substring to look for.
 * @param {number} [limit=200] - Maximum number of paths to return.
 * @returns {Promise<string[]>} Matching paths relative to BASE_DIR.
 */
async function fuzzySearchFiles(baseDir, query, limit = 200) {
  const files = await fg(["**/*"], {
    cwd: baseDir,
    ignore: ["**/node_modules/**", "**/.git/**", "**/dist/**", "**/build/**"],
    onlyFiles: true,
    dot: false,
  });
  const q = query.toLowerCase();
  const matched = files.filter((p) => p.toLowerCase().includes(q)).slice(0, limit);
  // Convert to paths relative to the global BASE_DIR.
  return matched.map((p) => path.relative(BASE_DIR, path.join(baseDir, p)));
}

// Embedding utility
/**
 * Embeds texts through an OpenAI-compatible embeddings endpoint.
 *
 * The provider is ENV.EMBED_PROVIDER when set; otherwise "siliconflow" if a
 * SiliconFlow key is configured, else "openai".
 *
 * @param {string[]} texts - Inputs to embed.
 * @returns {Promise<number[][]>} One embedding vector per input.
 * @throws {Error} When the API key, or the base URL for the custom provider,
 *   is missing.
 */
async function embedTexts(texts) {
  // provider: openai | siliconflow | custom
  const provider = (ENV.EMBED_PROVIDER || (ENV.SILICONFLOW_API_KEY ? "siliconflow" : "openai")).toLowerCase();
  let apiKey;
  let baseURL = ENV.EMBED_BASE_URL;
  if (provider === "siliconflow") {
    apiKey = ENV.SILICONFLOW_API_KEY;
    baseURL = baseURL || "https://api.siliconflow.cn/v1";
  } else if (provider === "custom") {
    // A custom provider may authenticate with either configured key.
    apiKey = ENV.OPENAI_API_KEY || ENV.SILICONFLOW_API_KEY;
    if (!baseURL) throw new Error("SEARCH_EMBED_BASE_URL is required for custom provider");
  } else {
    // default openai
    apiKey = ENV.OPENAI_API_KEY;
  }
  if (!apiKey) throw new Error("Embedding provider API key is missing");
  const client = new OpenAI({ apiKey, ...(baseURL ? { baseURL } : {}) });
  const res = await client.embeddings.create({ model: ENV.EMBED_MODEL, input: texts });
  return res.data.map((d) => d.embedding);
}

/**
 * Embeds `query` and runs a nearest-neighbour search in the configured
 * vector store (Pinecone when ENV.VECTOR_PROVIDER is "pinecone", otherwise
 * Milvus).
 *
 * @param {string} query - Natural-language search text.
 * @param {number} [topK=8] - Number of hits to request.
 * @returns {Promise<object[]>} Pinecone hits as `{ id, score, metadata }`;
 *   Milvus hits as `{ id, score, repo, path, url, commit, meta }`.
 * @throws {Error} When the Pinecone provider is selected but not configured,
 *   or when embedding or the search request fails.
 */
async function semanticSearch(query, topK = 8) {
  const [vec] = await embedTexts([query]);
  if ((ENV.VECTOR_PROVIDER || "milvus").toLowerCase() === "pinecone") {
    if (!ENV.PINECONE_API_KEY || !ENV.PINECONE_INDEX) throw new Error("Pinecone env PINECONE_API_KEY/PINECONE_INDEX required");
    const pc = new Pinecone({ apiKey: ENV.PINECONE_API_KEY });
    const index = pc.index(ENV.PINECONE_INDEX).namespace(ENV.PINECONE_NAMESPACE || "default");
    const q = await index.query({ topK, vector: vec, includeMetadata: true });
    return (q.matches || []).map((m) => ({ id: m.id, score: m.score, metadata: m.metadata }));
  }
  // default milvus
  const client = new MilvusClient({
    address: ENV.MILVUS_ADDRESS,
    username: ENV.MILVUS_USERNAME,
    password: ENV.MILVUS_PASSWORD,
    ssl: false,
  });
  try {
    try {
      await client.loadCollectionSync({ collection_name: ENV.MILVUS_COLLECTION });
    } catch {
      // Best effort: a load failure is ignored here; if the collection is
      // genuinely unusable the search call below reports the error.
    }
    const annsField = ENV.MILVUS_VECTOR_FIELD || "vector";
    const res = await client.search({
      collection_name: ENV.MILVUS_COLLECTION,
      vectors: [vec],
      vector_type: DataType.FloatVector,
      anns_field: annsField,
      topk: topK,
      metric_type: MetricType[ENV.MILVUS_METRIC] || MetricType.IP,
      params: { nprobe: 10 },
      output_fields: ["id", "repo", "path", "url", "commit", "meta_json"],
    });
    const out = [];
    for (const hit of res.results || res.results_fields || []) {
      // Hits are read either from a nested `fields` object or from the hit
      // itself, whichever is present.
      const fields = hit.fields || hit;
      out.push({
        id: fields.id,
        score: hit.score || fields.distance || 0,
        repo: fields.repo,
        path: fields.path,
        url: fields.url,
        commit: fields.commit,
        meta: fields.meta_json ? JSON.parse(fields.meta_json) : undefined,
      });
    }
    return out;
  } finally {
    // A new client is created on every call; release its connection so a
    // long-running server does not accumulate open channels.
    try {
      await client.closeConnection();
    } catch {
      // Cleanup is best effort and must not mask the search outcome.
    }
  }
}

/**
 * Runs a web search through the Google Custom Search JSON API.
 *
 * @param {string} query - Search text.
 * @param {number} [num=5] - Number of results, clamped to [1, 10].
 * @returns {Promise<{ results: Array<{ title: string, link: string, snippet: string }>, warning?: string }>}
 *   Results; `warning` is set (with empty results) when the API is not
 *   configured or the request fails.
 */
async function googleCseSearch(query, num = 5) {
  if (!ENV.GOOGLE_API_KEY || !ENV.GOOGLE_CSE_ID) {
    return { warning: "未配置 GOOGLE_API_KEY/GOOGLE_CSE_ID,返回占位结果。", results: [] };
  }
  const url = new URL("https://www.googleapis.com/customsearch/v1");
  url.searchParams.set("key", ENV.GOOGLE_API_KEY);
  url.searchParams.set("cx", ENV.GOOGLE_CSE_ID);
  url.searchParams.set("q", query);
  url.searchParams.set("num", String(Math.max(1, Math.min(num, 10))));
  const res = await fetch(url.toString());
  if (!res.ok) {
    // Surface quota/auth/parameter failures instead of reporting them as an
    // empty result list.
    let reason = res.statusText;
    try {
      const body = await res.json();
      reason = body?.error?.message || reason;
    } catch {
      // Error body was not JSON; keep the HTTP status text.
    }
    return { warning: `Google CSE request failed (HTTP ${res.status}): ${reason}`, results: [] };
  }
  const data = await res.json();
  const items = (data.items || []).map((it) => ({ title: it.title, link: it.link, snippet: it.snippet }));
  return { results: items };
}

// ---------------- MCP Server ----------------
const server = new McpServer({
  name: "aipm-search",
  version: "1.0.0",
  description: "Search MCP: read/list, grep, search files, semantic code search, web search",
});

// Read File
server.registerTool("readFile", {
  title: "读取文件(可限制行数)",
  description: "读取指定文件内容,可通过 offset 与 limit 控制返回行范围。",
  inputSchema: {
    path: z.string().describe("文件绝对或相对路径"),
    offset: z.number().int().optional().describe("起始行,从0开始,可选"),
    limit: z.number().int().optional().describe("返回的最大行数,可选"),
  },
}, async ({ path: filePath, offset, limit }) => {
  const { text, totalLines } = await readTextFileLimited(filePath, offset, limit);
  return { content: [{ type: "text", text }], meta: { totalLines } };
});

// List Directory
server.registerTool("listDirectory", {
  title: "列出目录",
  description: "列出目录内容(文件/文件夹)",
  inputSchema: {
    path: z.string().describe("目录路径"),
  },
}, async ({ path: dir }) => {
  const entries = await listDirSafe(dir);
  return { content: [{ type: "text", text: JSON.stringify(entries, null, 2) }] };
});

// Grep
server.registerTool("grepSearch", {
  title: "Grep 搜索",
  description: "在代码中使用正则查找匹配的行,支持大小写与多行模式、上下文和数量限制。",
  inputSchema: {
    baseDir: z.string().optional().describe("搜索起始目录,默认当前工作目录"),
    pattern: z.string().describe("正则表达式"),
    include: z.array(z.string()).optional().describe("包含的glob列表,默认 **/*"),
    exclude: z.array(z.string()).optional().describe("排除的glob列表"),
    caseInsensitive: z.boolean().optional(),
    multiline: z.boolean().optional(),
    contextBefore: z.number().int().optional(),
    contextAfter: z.number().int().optional(),
    headLimit: z.number().int().optional(),
  },
}, async ({ baseDir, pattern, include, exclude, caseInsensitive, multiline, contextBefore, contextAfter, headLimit }) => {
  const dir = toAbsolute(baseDir || ".");
  const hits = await grepInFiles(dir, pattern, { include, exclude, caseInsensitive, multiline, contextBefore, contextAfter, headLimit });
  return { content: [{ type: "text", text: JSON.stringify(hits, null, 2) }] };
});

// Search Files (fuzzy by name)
server.registerTool("searchFiles", {
  title: "按文件名模糊搜索",
  description: "基于文件名的模糊匹配,返回相对权重的前若干个文件路径。",
  inputSchema: {
    baseDir: z.string().optional().describe("起始目录,默认当前工作目录"),
    query: z.string().describe("查询词,将执行文件名包含匹配"),
    limit: z.number().int().optional().describe("返回数量上限,默认200"),
  },
}, async ({ baseDir, query, limit }) => {
  const dir = toAbsolute(baseDir || ".");
  const files = await fuzzySearchFiles(dir, query, limit);
  return { content: [{ type: "text", text: JSON.stringify(files, null, 2) }] };
});

// Codebase semantic search
server.registerTool("codebaseSearch", {
  title: "语义搜索代码库",
  description: "基于向量数据库(Milvus/Pinecone)的代码片段语义检索",
  inputSchema: {
    query: z.string().describe("检索语句"),
    topK: z.number().int().optional().describe("返回条数,默认为8"),
  },
}, async ({ query, topK }) => {
  const results = await semanticSearch(query, topK);
  return { content: [{ type: "text", text: JSON.stringify(results, null, 2) }] };
});

// Web search via Google CSE if configured
server.registerTool("webSearch", {
  title: "Web 搜索",
  description: "调用 Google CSE(需配置 GOOGLE_API_KEY/GOOGLE_CSE_ID),否则返回占位结果。",
  inputSchema: {
    query: z.string().describe("检索语句"),
    num: z.number().int().optional().describe("返回条数,1-10,默认5"),
  },
}, async ({ query, num }) => {
  const data = await googleCseSearch(query, num);
  return { content: [{ type: "text", text: JSON.stringify(data, null, 2) }] };
});

/**
 * Connects the server to the stdio transport; exits with code 1 on failure.
 */
async function main() {
  try {
    const transport = new StdioServerTransport();
    await server.connect(transport);
    // stdout carries the JSON-RPC protocol stream on the stdio transport, so
    // diagnostics must go to stderr to avoid corrupting it.
    // eslint-disable-next-line no-console
    console.error("[SearchMCP] ready on stdio");
  } catch (err) {
    // eslint-disable-next-line no-console
    console.error("[SearchMCP] failed to start:", err);
    process.exit(1);
  }
}

main();