UNPKG

@siddhantxh/code-extractor-mcp

Version: 1.0.0

MCP server for extracting code from GitHub repositories and Google Colab notebooks

501 lines (437 loc) 16.6 kB
#!/usr/bin/env node
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { z } from "zod";
import { Octokit } from 'octokit';
import { Base64 } from 'js-base64';
import ignore from 'ignore';

// --- Core Logic ---

// Authenticated client when GITHUB_TOKEN is set; anonymous (rate-limited) otherwise.
const octokit = new Octokit({
  auth: process.env.GITHUB_TOKEN,
});

/**
 * Rough token estimate using the common ~4-characters-per-token heuristic.
 * @param {string} text
 * @returns {number} estimated token count (ceiling)
 */
function estimateTokens(text) {
  return Math.ceil(text.length / 4);
}

/**
 * Render a flat list of slash-separated paths as an ASCII tree
 * (└──/├── branches with │ continuation rails).
 * @param {string[]} paths - repository-relative file paths
 * @returns {string} multi-line tree, each line terminated by '\n'
 */
function generateFileTree(paths) {
  // Build a nested object where each key is a path segment.
  const root = {};
  for (const path of paths) {
    path.split('/').reduce((node, name) => {
      node[name] = node[name] || {};
      return node[name];
    }, root);
  }

  const createTree = (node, prefix = '') => {
    const entries = Object.entries(node);
    let treeString = '';
    entries.forEach(([name, child], index) => {
      const isLast = index === entries.length - 1;
      treeString += `${prefix}${isLast ? '└── ' : '├── '}${name}\n`;
      treeString += createTree(child, `${prefix}${isLast ? '    ' : '│   '}`);
    });
    return treeString;
  };

  return createTree(root);
}

// Extensions that are binary or otherwise useless for LLM ingestion.
const UNWANTED_EXTENSIONS = new Set([
  'png', 'jpg', 'jpeg', 'gif', 'bmp', 'ico', 'svg', 'webp',
  'mp4', 'mov', 'avi', 'mkv', 'mp3', 'wav', 'ogg',
  'zip', 'rar', 'tar', 'gz',
  'pdf', 'doc', 'docx', 'xls', 'xlsx',
  'lock', 'sum',
]);

/**
 * Skip binary assets and lockfiles by extension.
 * @param {string} path
 * @returns {boolean} true when the file should be excluded
 */
function isBinaryOrUnwanted(path) {
  const extension = path.split('.').pop()?.toLowerCase();
  return extension ? UNWANTED_EXTENSIONS.has(extension) : false;
}

/**
 * Parse a github.com URL into its parts.
 * @param {string} url
 * @returns {{owner: string, repo: string, file?: string, treeSha: string} | null}
 *   treeSha defaults to 'HEAD' when no /blob/ or /tree/ segment is present;
 *   file is the path after the ref, if any.
 */
function parseGitHubUrl(url) {
  const match = url.match(/github\.com\/([^/]+)\/([^/]+)(?:\/(?:blob|tree)\/([^/]+)\/?(.*))?/);
  if (!match) return null;
  const [, owner, repo, ref, file] = match;
  return { owner, repo, file, treeSha: ref || 'HEAD' };
}

// Matches Colab-over-GitHub notebook URLs; captures owner, repo, branch, file path.
const COLAB_URL_RE = /colab\.research\.google\.com\/github\/([^/]+)\/([^/]+)\/blob\/([^/]+)\/(.+)/;

/**
 * Parse a colab.research.google.com/github/... URL.
 * @param {string} url
 * @returns {{owner: string, repo: string, branch: string, file: string} | null}
 */
function parseColabUrl(url) {
  const match = url.match(COLAB_URL_RE);
  if (!match) return null;
  const [, owner, repo, branch, file] = match;
  return { owner, repo, branch, file };
}

/** Build the raw.githubusercontent.com URL for a file at a given ref. */
function rawGitHubUrl(owner, repo, ref, file) {
  return `https://raw.githubusercontent.com/${owner}/${repo}/${ref}/${file}`;
}

/**
 * Fetch the recursive git tree and filter it for ingestion:
 * respects the repo's root .gitignore, user-supplied glob excludes,
 * a per-file size limit, and the binary-extension blocklist.
 * @param {string} owner
 * @param {string} repo
 * @param {string} treeSha - branch, tag, or commit SHA ('HEAD' allowed)
 * @param {string} [excludePatterns] - comma-separated glob patterns
 * @param {number} [sizeLimitKB] - maximum file size to include
 * @returns {Promise<{allFiles: object[], filteredFiles: object[], paths: string[]}>}
 * @throws {Error} wrapping any GitHub API failure
 */
async function getRepoTree(owner, repo, treeSha, excludePatterns = '', sizeLimitKB = 64) {
  try {
    const { data: treeData } = await octokit.rest.git.getTree({
      owner,
      repo,
      tree_sha: treeSha,
      recursive: '1',
    });
    if (!treeData.tree) {
      throw new Error('Could not fetch repository tree. The repository might be empty.');
    }

    // Honor the repository's root .gitignore, if present.
    const ig = ignore();
    const gitignoreEntry = treeData.tree.find(file => file.path === '.gitignore');
    if (gitignoreEntry && gitignoreEntry.sha) {
      const { data: blob } = await octokit.rest.git.getBlob({
        owner,
        repo,
        file_sha: gitignoreEntry.sha,
      });
      ig.add(Base64.decode(blob.content));
    }

    // User-defined excludes are treated like extra .gitignore patterns.
    const userExcludePatterns = excludePatterns.split(',').map(p => p.trim()).filter(Boolean);
    ig.add(userExcludePatterns);

    const filteredFiles = treeData.tree.filter(
      (item) =>
        item.type === 'blob' &&
        item.path &&
        item.size &&                      // skips zero-byte files (nothing to ingest)
        item.size <= sizeLimitKB * 1024 &&
        !isBinaryOrUnwanted(item.path) &&
        !ig.ignores(item.path)
    );

    return {
      allFiles: treeData.tree,
      filteredFiles,
      paths: filteredFiles.map(f => f.path),
    };
  } catch (error) {
    throw new Error(`Failed to fetch repository tree: ${error.message}`, { cause: error });
  }
}

/**
 * Fetch and decode a single blob's content by SHA.
 * @returns {Promise<string>} UTF-8 decoded file content
 * @throws {Error} wrapping any GitHub API failure
 */
async function getFileContent(owner, repo, fileSha) {
  try {
    const { data: blob } = await octokit.rest.git.getBlob({
      owner,
      repo,
      file_sha: fileSha,
    });
    return Base64.decode(blob.content);
  } catch (error) {
    throw new Error(`Failed to fetch file content: ${error.message}`, { cause: error });
  }
}

/**
 * nbformat allows a cell's `source` to be either a string or a list of
 * strings — normalize to a single string.
 */
function cellSourceText(cell) {
  return Array.isArray(cell.source) ? cell.source.join('') : String(cell.source ?? '');
}

/**
 * Download a .ipynb file and flatten its cells into labelled plain text.
 * @param {string} url - raw notebook URL
 * @returns {Promise<string>}
 * @throws {Error} on fetch failure or malformed notebook JSON
 */
async function getNotebookContent(url) {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to fetch notebook content. Status: ${response.status}`);
    }
    const notebook = await response.json();
    let output = 'Jupyter Notebook Content:\n\n';
    notebook.cells.forEach((cell, index) => {
      const cellType = cell.cell_type.toUpperCase();
      output += `--- CELL ${index + 1} (${cellType}) ---\n`;
      output += `${cellSourceText(cell)}\n\n`;
    });
    return output;
  } catch (error) {
    throw new Error(`Could not parse notebook content: ${error.message}`, { cause: error });
  }
}

// --- Shared response helpers ---

/** Wrap plain text in the MCP tool-result envelope. */
function textResult(text) {
  return { content: [{ type: "text", text }] };
}

/** Standard error envelope used by every tool's catch block. */
function errorResult(error) {
  return textResult(`Error: ${error.message}`);
}

/** Fetch a notebook and append the standard stats footer. */
async function notebookResult(rawUrl, file) {
  const content = await getNotebookContent(rawUrl);
  const tokenCount = estimateTokens(content);
  return textResult(
    `${content}\n\n--- STATS ---\nFile: ${file}\nToken Count: ${tokenCount}\nSize: ${Buffer.byteLength(content, 'utf8')} bytes`
  );
}

/**
 * Full-repository extraction shared by fetchAllContent and
 * fetchAllContentButExclude: tree, concatenated file bodies, stats footer.
 */
async function repositoryResult(owner, repo, treeSha, exclude, sizeLimitKB) {
  const { filteredFiles, paths } = await getRepoTree(owner, repo, treeSha, exclude, sizeLimitKB);

  // Blobs are independent — fetch them in parallel.
  const fileContents = await Promise.all(
    filteredFiles.map(async (file) => ({
      path: file.path,
      content: await getFileContent(owner, repo, file.sha),
    }))
  );

  const fileTree = generateFileTree(paths);
  let output = `Repository: ${owner}/${repo}\nBranch/Commit: ${treeSha}\n\n`;
  output += `File Tree:\n${fileTree}\n\n`;
  for (const file of fileContents) {
    output += `--- FILE: ${file.path} ---\n`;
    output += `${file.content}\n\n`;
  }

  const totalSizeKB =
    fileContents.reduce((acc, file) => acc + Buffer.byteLength(file.content, 'utf8'), 0) / 1024;
  const tokenCount = estimateTokens(output);
  output += `--- STATS ---\nFiles: ${fileContents.length}\nTotal Size: ${totalSizeKB.toFixed(2)} KB\nToken Count: ${tokenCount}`;

  return textResult(output);
}

// --- MCP Server Setup ---

const server = new McpServer({
  name: "github-extractor",
  version: "1.0.0",
  instructions: "Extracts code and content from GitHub repositories and Google Colab notebooks for LLM ingestion"
});

// Tool 1: Get repository/notebook file tree
server.tool(
  "tree",
  {
    url: z.string().describe("GitHub repository URL or Google Colab notebook URL"),
    exclude: z.string().optional().default("").describe("Comma-separated glob patterns to exclude"),
    sizeLimitKB: z.number().optional().default(64).describe("Maximum file size in KB to include")
  },
  async ({ url, exclude, sizeLimitKB }) => {
    try {
      // Colab URLs point at a single notebook — no tree to walk.
      if (url.includes('colab.research.google.com')) {
        const colab = parseColabUrl(url);
        if (colab) {
          return textResult(
            `Colab Notebook File Tree:\n└── ${colab.file}\n\nNote: This is a single notebook file.`
          );
        }
        throw new Error("Could not parse Colab URL");
      }

      const githubParts = parseGitHubUrl(url);
      if (!githubParts) {
        throw new Error("Invalid GitHub URL format");
      }
      const { owner, repo, treeSha } = githubParts;
      const { paths } = await getRepoTree(owner, repo, treeSha, exclude, sizeLimitKB);
      const fileTree = generateFileTree(paths);
      return textResult(
        `Repository: ${owner}/${repo}\nBranch/Commit: ${treeSha}\n\nFile Tree:\n${fileTree}\n\nTotal files: ${paths.length}`
      );
    } catch (error) {
      return errorResult(error);
    }
  }
);

// Tool 2: Fetch all content
server.tool(
  "fetchAllContent",
  {
    url: z.string().describe("GitHub repository URL or Google Colab notebook URL"),
    exclude: z.string().optional().default("").describe("Comma-separated glob patterns to exclude"),
    sizeLimitKB: z.number().optional().default(64).describe("Maximum file size in KB to include")
  },
  async ({ url, exclude, sizeLimitKB }) => {
    try {
      if (url.includes('colab.research.google.com')) {
        const colab = parseColabUrl(url);
        if (colab) {
          return await notebookResult(
            rawGitHubUrl(colab.owner, colab.repo, colab.branch, colab.file),
            colab.file
          );
        }
        throw new Error("Could not parse Colab URL");
      }

      const githubParts = parseGitHubUrl(url);
      if (!githubParts) {
        throw new Error("Invalid GitHub URL format");
      }
      const { owner, repo, treeSha, file } = githubParts;

      // A direct link to a notebook file gets the flattened-cells treatment.
      if (file && file.endsWith('.ipynb')) {
        return await notebookResult(rawGitHubUrl(owner, repo, treeSha, file), file);
      }

      return await repositoryResult(owner, repo, treeSha, exclude, sizeLimitKB);
    } catch (error) {
      return errorResult(error);
    }
  }
);

// Tool 3: Fetch all content but exclude specific file extensions
server.tool(
  "fetchAllContentButExclude",
  {
    url: z.string().describe("GitHub repository URL or Google Colab notebook URL"),
    excludeExtensions: z.string().describe("Comma-separated file extensions to exclude (e.g., 'test.js,spec.ts,.md')"),
    exclude: z.string().optional().default("").describe("Additional comma-separated glob patterns to exclude"),
    sizeLimitKB: z.number().optional().default(64).describe("Maximum file size in KB to include")
  },
  async ({ url, excludeExtensions, exclude, sizeLimitKB }) => {
    try {
      // Turn bare extensions into gitignore-style globs
      // ('md' -> '**/*.md', '.md' -> '**/*.md', 'test.js' -> '**/*.test.js').
      const extensionPatterns = excludeExtensions.split(',')
        .map(ext => ext.trim())
        .filter(Boolean)
        .map(ext => (ext.startsWith('.') ? `**/*${ext}` : `**/*.${ext}`))
        .join(',');
      const combinedExclude = [exclude, extensionPatterns].filter(Boolean).join(',');

      if (url.includes('colab.research.google.com')) {
        const colab = parseColabUrl(url);
        if (colab) {
          return await notebookResult(
            rawGitHubUrl(colab.owner, colab.repo, colab.branch, colab.file),
            colab.file
          );
        }
        throw new Error("Could not parse Colab URL");
      }

      const githubParts = parseGitHubUrl(url);
      if (!githubParts) {
        throw new Error("Invalid GitHub URL format");
      }
      const { owner, repo, treeSha, file } = githubParts;

      if (file && file.endsWith('.ipynb')) {
        return await notebookResult(rawGitHubUrl(owner, repo, treeSha, file), file);
      }

      return await repositoryResult(owner, repo, treeSha, combinedExclude, sizeLimitKB);
    } catch (error) {
      return errorResult(error);
    }
  }
);

// Tool 4: Fetch specific file content
server.tool(
  "specificContent",
  {
    url: z.string().describe("GitHub repository URL or Google Colab notebook URL"),
    filePath: z.string().describe("Specific file path to fetch (for repos) or cell index (for notebooks, e.g., 'cell-3')")
  },
  async ({ url, filePath }) => {
    try {
      if (url.includes('colab.research.google.com')) {
        const colab = parseColabUrl(url);
        if (colab) {
          const rawUrl = rawGitHubUrl(colab.owner, colab.repo, colab.branch, colab.file);
          const response = await fetch(rawUrl);
          if (!response.ok) {
            throw new Error(`Failed to fetch notebook. Status: ${response.status}`);
          }
          const notebook = await response.json();

          // 'cell-N' selects a single 1-indexed cell.
          if (filePath.startsWith('cell-')) {
            const cellIndex = Number.parseInt(filePath.replace('cell-', ''), 10) - 1;
            if (Number.isNaN(cellIndex) || cellIndex < 0 || cellIndex >= notebook.cells.length) {
              throw new Error(`Cell ${cellIndex + 1} not found. Notebook has ${notebook.cells.length} cells.`);
            }
            const cell = notebook.cells[cellIndex];
            const cellType = cell.cell_type.toUpperCase();
            return textResult(`--- CELL ${cellIndex + 1} (${cellType}) ---\n${cellSourceText(cell)}`);
          }

          // No cell selector: return the whole flattened notebook.
          const content = await getNotebookContent(rawUrl);
          return textResult(content);
        }
        throw new Error("Could not parse Colab URL");
      }

      const githubParts = parseGitHubUrl(url);
      if (!githubParts) {
        throw new Error("Invalid GitHub URL format");
      }
      const { owner, repo, treeSha } = githubParts;

      // Walk the full tree to resolve the path to a blob SHA.
      const { data: treeData } = await octokit.rest.git.getTree({
        owner,
        repo,
        tree_sha: treeSha,
        recursive: '1',
      });
      const targetFile = treeData.tree.find(file => file.path === filePath && file.type === 'blob');
      if (!targetFile) {
        throw new Error(`File '${filePath}' not found in repository`);
      }

      const content = await getFileContent(owner, repo, targetFile.sha);
      const tokenCount = estimateTokens(content);
      return textResult(
        `--- FILE: ${filePath} ---\n${content}\n\n--- STATS ---\nToken Count: ${tokenCount}\nSize: ${Buffer.byteLength(content, 'utf8')} bytes`
      );
    } catch (error) {
      return errorResult(error);
    }
  }
);

const transport = new StdioServerTransport();
await server.connect(transport);