@siddhantxh/code-extractor-mcp
Version: 1.0.0
MCP server for extracting code from GitHub repositories and Google Colab notebooks
501 lines (437 loc) • 16.6 kB
JavaScript
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { z } from "zod";
import { Octokit } from 'octokit';
import { Base64 } from 'js-base64';
import ignore from 'ignore';
// --- Core Logic ---
// Shared GitHub REST client. GITHUB_TOKEN is read from the environment; when
// unset, Octokit falls back to unauthenticated requests (lower rate limits).
const octokit = new Octokit({
  auth: process.env.GITHUB_TOKEN,
});
/**
 * Rough token-count estimate for LLM budgeting, using the common
 * ~4-characters-per-token heuristic.
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (ceiling of length / 4).
 */
function estimateTokens(text) {
  const CHARS_PER_TOKEN = 4;
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}
/**
 * Render a list of slash-separated file paths as an ASCII directory tree.
 * Entries appear in first-seen order (the order of `paths`).
 * @param {string[]} paths - File paths like 'src/index.js'.
 * @returns {string} Tree text, one entry per line, ending with '\n' (or '' for no paths).
 */
function generateFileTree(paths) {
  // Build a trie of path segments: each key is a directory or file name.
  const root = {};
  paths.forEach(path => {
    path.split('/').reduce((acc, name) => {
      acc[name] = acc[name] || {};
      return acc[name];
    }, root);
  });
  const createTree = (node, prefix = '') => {
    const entries = Object.entries(node);
    let treeString = '';
    entries.forEach(([name, newNode], index) => {
      const isLast = index === entries.length - 1;
      treeString += `${prefix}${isLast ? '└── ' : '├── '}${name}\n`;
      // Continuation prefixes must be 4 chars wide to align children under
      // the 4-char branch glyphs above ('└── ' / '├── ').
      treeString += createTree(newNode, `${prefix}${isLast ? '    ' : '│   '}`);
    });
    return treeString;
  };
  return createTree(root);
}
/**
 * Decide whether a path's extension marks it as binary or otherwise unwanted
 * content that should be skipped during extraction.
 * Paths without a listed extension (including extensionless files) are kept.
 * @param {string} path - Repository-relative file path.
 * @returns {boolean} true when the file should be excluded.
 */
function isBinaryOrUnwanted(path) {
  const UNWANTED_EXTENSIONS = new Set([
    // images
    'png', 'jpg', 'jpeg', 'gif', 'bmp', 'ico', 'svg', 'webp',
    // video
    'mp4', 'mov', 'avi', 'mkv',
    // audio
    'mp3', 'wav', 'ogg',
    // archives
    'zip', 'rar', 'tar', 'gz',
    // documents
    'pdf', 'doc', 'docx', 'xls', 'xlsx',
    // lockfiles / checksums
    'lock', 'sum',
  ]);
  const ext = path.split('.').pop()?.toLowerCase();
  return ext ? UNWANTED_EXTENSIONS.has(ext) : false;
}
/**
 * Parse a GitHub URL into its components.
 * Accepts repo-root URLs and blob/tree URLs that carry a ref and path.
 * @param {string} url - e.g. 'https://github.com/o/r/blob/main/src/app.js'.
 * @returns {{owner: string, repo: string, file: string|undefined, treeSha: string}|null}
 *   null when the URL is not a recognizable GitHub URL; treeSha defaults to 'HEAD'.
 */
function parseGitHubUrl(url) {
  const match = url.match(/github\.com\/([^/]+)\/([^/]+)(?:\/(?:blob|tree)\/([^/]+)\/?(.*))?/);
  if (!match) return null;
  const owner = match[1];
  // The repo segment can capture a trailing '.git' or a query/hash suffix
  // (e.g. 'repo.git', 'repo?tab=readme'); strip both so API calls work.
  const repo = match[2].split(/[?#]/)[0].replace(/\.git$/, '');
  const treeSha = match[3] || 'HEAD';
  // Normalize the '' produced by a trailing slash ('.../tree/main/') to undefined.
  const file = match[4] || undefined;
  return { owner, repo, file, treeSha };
}
/**
 * Fetch a repository's recursive git tree and filter it down to the text
 * files worth extracting.
 * Filtering applies, in order: blob type, path present, size limit,
 * binary/unwanted extension, .gitignore rules, and user exclude patterns.
 * @param {string} owner - Repository owner.
 * @param {string} repo - Repository name.
 * @param {string} treeSha - Branch, tag, or commit SHA ('HEAD' allowed).
 * @param {string} [excludePatterns=''] - Comma-separated gitignore-style patterns.
 * @param {number} [sizeLimitKB=64] - Maximum individual file size to include.
 * @returns {Promise<{allFiles: object[], filteredFiles: object[], paths: string[]}>}
 * @throws {Error} Wrapping any API failure; original error kept as `cause`.
 */
async function getRepoTree(owner, repo, treeSha, excludePatterns = '', sizeLimitKB = 64) {
  try {
    const { data: treeData } = await octokit.rest.git.getTree({
      owner,
      repo,
      tree_sha: treeSha,
      recursive: '1',
    });
    if (!treeData.tree) {
      throw new Error('Could not fetch repository tree. The repository might be empty.');
    }
    // Honor the repository's own .gitignore, if present at the root.
    const ig = ignore();
    const gitignoreEntry = treeData.tree.find(file => file.path === '.gitignore');
    if (gitignoreEntry && gitignoreEntry.sha) {
      const { data: blob } = await octokit.rest.git.getBlob({
        owner,
        repo,
        file_sha: gitignoreEntry.sha,
      });
      ig.add(Base64.decode(blob.content));
    }
    // Merge in user-defined excludes (same gitignore-style syntax).
    const userExcludePatterns = excludePatterns.split(',').map(p => p.trim()).filter(Boolean);
    ig.add(userExcludePatterns);
    const filteredFiles = treeData.tree.filter(
      (item) =>
        item.type === 'blob' &&
        item.path &&
        // Explicit number check: the previous truthiness test (`item.size &&`)
        // silently dropped zero-byte files such as __init__.py.
        typeof item.size === 'number' &&
        item.size <= sizeLimitKB * 1024 &&
        !isBinaryOrUnwanted(item.path) &&
        !ig.ignores(item.path)
    );
    return {
      allFiles: treeData.tree,
      filteredFiles,
      paths: filteredFiles.map(f => f.path),
    };
  } catch (error) {
    // Keep the outward message stable but preserve the original error for debugging.
    throw new Error(`Failed to fetch repository tree: ${error.message}`, { cause: error });
  }
}
/**
 * Download a git blob by SHA and decode its Base64 payload to text.
 * @param {string} owner - Repository owner.
 * @param {string} repo - Repository name.
 * @param {string} fileSha - Git blob SHA.
 * @returns {Promise<string>} Decoded file content.
 * @throws {Error} When the blob cannot be fetched or decoded.
 */
async function getFileContent(owner, repo, fileSha) {
  try {
    const response = await octokit.rest.git.getBlob({ owner, repo, file_sha: fileSha });
    return Base64.decode(response.data.content);
  } catch (error) {
    throw new Error(`Failed to fetch file content: ${error.message}`);
  }
}
/**
 * Fetch a Jupyter notebook (.ipynb) and flatten its cells into readable text.
 * @param {string} url - Direct URL to the raw notebook JSON.
 * @returns {Promise<string>} 'Jupyter Notebook Content:' header followed by one
 *   '--- CELL n (TYPE) ---' section per cell.
 * @throws {Error} On fetch failure, non-2xx status, invalid JSON, or a
 *   notebook without a cells array.
 */
async function getNotebookContent(url) {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to fetch notebook content. Status: ${response.status}`);
    }
    const notebook = await response.json();
    // nbformat v4 keeps cells at the top level; older formats (worksheets)
    // are unsupported — fail with a clear message instead of a TypeError.
    if (!Array.isArray(notebook.cells)) {
      throw new Error('Notebook has no cells array (unsupported notebook format)');
    }
    let output = 'Jupyter Notebook Content:\n\n';
    notebook.cells.forEach((cell, index) => {
      const cellType = cell.cell_type.toUpperCase();
      // The nbformat spec allows cell.source to be a string OR a list of
      // line strings; the old unconditional .join('') crashed on strings.
      const source = Array.isArray(cell.source) ? cell.source.join('') : cell.source;
      output += `--- CELL ${index + 1} (${cellType}) ---\n`;
      output += `${source}\n\n`;
    });
    return output;
  } catch (error) {
    throw new Error(`Could not parse notebook content: ${error.message}`);
  }
}
// --- MCP Server Setup ---
// Single MCP server instance. The four tools below are registered on it and
// it is connected to a stdio transport at the bottom of the file.
const server = new McpServer({
  name: "github-extractor",
  version: "1.0.0",
  instructions: "Extracts code and content from GitHub repositories and Google Colab notebooks for LLM ingestion"
});
// Tool 1: Report the file tree of a repository (or a single Colab notebook).
server.tool("tree",
  {
    url: z.string().describe("GitHub repository URL or Google Colab notebook URL"),
    exclude: z.string().optional().default("").describe("Comma-separated glob patterns to exclude"),
    sizeLimitKB: z.number().optional().default(64).describe("Maximum file size in KB to include")
  },
  async ({ url, exclude, sizeLimitKB }) => {
    // Wrap a plain string in the MCP text-content envelope.
    const asText = (text) => ({ content: [{ type: "text", text }] });
    try {
      // A Colab URL identifies exactly one notebook, so its "tree" is one file.
      if (url.includes('colab.research.google.com')) {
        const colab = url.match(/colab\.research\.google\.com\/github\/([^/]+)\/([^/]+)\/blob\/([^/]+)\/(.+)/);
        if (!colab) {
          throw new Error("Could not parse Colab URL");
        }
        const file = colab[4];
        return asText(`Colab Notebook File Tree:\n└── ${file}\n\nNote: This is a single notebook file.`);
      }
      // GitHub URL: list the filtered blob paths as an ASCII tree.
      const githubParts = parseGitHubUrl(url);
      if (!githubParts) {
        throw new Error("Invalid GitHub URL format");
      }
      const { owner, repo, treeSha } = githubParts;
      const { paths } = await getRepoTree(owner, repo, treeSha, exclude, sizeLimitKB);
      const fileTree = generateFileTree(paths);
      return asText(`Repository: ${owner}/${repo}\nBranch/Commit: ${treeSha}\n\nFile Tree:\n${fileTree}\n\nTotal files: ${paths.length}`);
    } catch (error) {
      // Failures are reported as tool output rather than thrown to the client.
      return asText(`Error: ${error.message}`);
    }
  }
);
// Tool 2: Fetch all content
// Dumps an entire repository (file tree + every filtered file's body + stats),
// or the flattened cells of a single .ipynb notebook when the URL targets one.
server.tool("fetchAllContent",
  {
    url: z.string().describe("GitHub repository URL or Google Colab notebook URL"),
    exclude: z.string().optional().default("").describe("Comma-separated glob patterns to exclude"),
    sizeLimitKB: z.number().optional().default(64).describe("Maximum file size in KB to include")
  },
  async ({ url, exclude, sizeLimitKB }) => {
    try {
      // Colab URLs: map to the raw.githubusercontent.com location of the
      // backing notebook and return its flattened cell contents.
      if (url.includes('colab.research.google.com')) {
        const match = url.match(/colab\.research\.google\.com\/github\/([^/]+)\/([^/]+)\/blob\/([^/]+)\/(.+)/);
        if (match) {
          const [_, owner, repo, branch, file] = match;
          const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file}`;
          const content = await getNotebookContent(rawUrl);
          const tokenCount = estimateTokens(content);
          return {
            content: [{
              type: "text",
              text: `${content}\n\n--- STATS ---\nFile: ${file}\nToken Count: ${tokenCount}\nSize: ${Buffer.byteLength(content, 'utf8')} bytes`
            }]
          };
        }
        throw new Error("Could not parse Colab URL");
      }
      // GitHub URLs
      const githubParts = parseGitHubUrl(url);
      if (!githubParts) {
        throw new Error("Invalid GitHub URL format");
      }
      const { owner, repo, treeSha, file } = githubParts;
      // URL points at one specific notebook file: dump just that notebook.
      if (file && file.endsWith('.ipynb')) {
        const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${treeSha}/${file}`;
        const content = await getNotebookContent(rawUrl);
        const tokenCount = estimateTokens(content);
        return {
          content: [{
            type: "text",
            text: `${content}\n\n--- STATS ---\nFile: ${file}\nToken Count: ${tokenCount}\nSize: ${Buffer.byteLength(content, 'utf8')} bytes`
          }]
        };
      }
      // Full repository extraction: filter the tree, then fetch every blob
      // concurrently. NOTE(review): one API call per file — very large repos
      // may hit GitHub rate limits; confirm acceptable for expected usage.
      const { filteredFiles, paths } = await getRepoTree(owner, repo, treeSha, exclude, sizeLimitKB);
      const fileContents = await Promise.all(
        filteredFiles.map(async (file) => {
          const content = await getFileContent(owner, repo, file.sha);
          return {
            path: file.path,
            content
          };
        })
      );
      const fileTree = generateFileTree(paths);
      let output = `Repository: ${owner}/${repo}\nBranch/Commit: ${treeSha}\n\n`;
      output += `File Tree:\n${fileTree}\n\n`;
      fileContents.forEach((file) => {
        output += `--- FILE: ${file.path} ---\n`;
        output += `${file.content}\n\n`;
      });
      // Token count is estimated BEFORE appending the stats footer, so the
      // reported count excludes the footer itself.
      const totalSizeKB = fileContents.reduce((acc, file) => acc + Buffer.byteLength(file.content, 'utf8'), 0) / 1024;
      const tokenCount = estimateTokens(output);
      output += `--- STATS ---\nFiles: ${fileContents.length}\nTotal Size: ${totalSizeKB.toFixed(2)} KB\nToken Count: ${tokenCount}`;
      return {
        content: [{
          type: "text",
          text: output
        }]
      };
    } catch (error) {
      // Failures are reported as tool output rather than thrown to the client.
      return {
        content: [{
          type: "text",
          text: `Error: ${error.message}`
        }]
      };
    }
  }
);
// Tool 3: Fetch all content but exclude specific file extensions
// Same as fetchAllContent, plus a comma-separated extension/suffix list that is
// converted to '**/*<suffix>' glob patterns and merged into the exclude set.
server.tool("fetchAllContentButExclude",
  {
    url: z.string().describe("GitHub repository URL or Google Colab notebook URL"),
    excludeExtensions: z.string().describe("Comma-separated file extensions to exclude (e.g., 'test.js,spec.ts,.md')"),
    exclude: z.string().optional().default("").describe("Additional comma-separated glob patterns to exclude"),
    sizeLimitKB: z.number().optional().default(64).describe("Maximum file size in KB to include")
  },
  async ({ url, excludeExtensions, exclude, sizeLimitKB }) => {
    try {
      // Convert suffixes to glob patterns; a leading dot is optional:
      // '.md' -> '**/*.md', 'md' -> '**/*.md', 'test.js' -> '**/*.test.js'.
      const extensionPatterns = excludeExtensions.split(',')
        .map(ext => ext.trim())
        .filter(Boolean)
        .map(ext => ext.startsWith('.') ? `**/*${ext}` : `**/*.${ext}`)
        .join(',');
      const combinedExclude = [exclude, extensionPatterns].filter(Boolean).join(',');
      // Colab URLs: extension filters do not apply to a single notebook.
      if (url.includes('colab.research.google.com')) {
        const match = url.match(/colab\.research\.google\.com\/github\/([^/]+)\/([^/]+)\/blob\/([^/]+)\/(.+)/);
        if (match) {
          const [_, owner, repo, branch, file] = match;
          const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file}`;
          const content = await getNotebookContent(rawUrl);
          const tokenCount = estimateTokens(content);
          return {
            content: [{
              type: "text",
              text: `${content}\n\n--- STATS ---\nFile: ${file}\nToken Count: ${tokenCount}\nSize: ${Buffer.byteLength(content, 'utf8')} bytes`
            }]
          };
        }
        throw new Error("Could not parse Colab URL");
      }
      // GitHub URLs
      const githubParts = parseGitHubUrl(url);
      if (!githubParts) {
        throw new Error("Invalid GitHub URL format");
      }
      const { owner, repo, treeSha, file } = githubParts;
      // URL points at one specific notebook file: dump just that notebook.
      if (file && file.endsWith('.ipynb')) {
        const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${treeSha}/${file}`;
        const content = await getNotebookContent(rawUrl);
        const tokenCount = estimateTokens(content);
        return {
          content: [{
            type: "text",
            text: `${content}\n\n--- STATS ---\nFile: ${file}\nToken Count: ${tokenCount}\nSize: ${Buffer.byteLength(content, 'utf8')} bytes`
          }]
        };
      }
      // Full repository extraction with the combined exclusion patterns.
      const { filteredFiles, paths } = await getRepoTree(owner, repo, treeSha, combinedExclude, sizeLimitKB);
      const fileContents = await Promise.all(
        filteredFiles.map(async (file) => {
          const content = await getFileContent(owner, repo, file.sha);
          return {
            path: file.path,
            content
          };
        })
      );
      const fileTree = generateFileTree(paths);
      let output = `Repository: ${owner}/${repo}\nBranch/Commit: ${treeSha}\n\n`;
      output += `File Tree:\n${fileTree}\n\n`;
      fileContents.forEach((file) => {
        output += `--- FILE: ${file.path} ---\n`;
        output += `${file.content}\n\n`;
      });
      // Token count is estimated BEFORE appending the stats footer, so the
      // reported count excludes the footer itself.
      const totalSizeKB = fileContents.reduce((acc, file) => acc + Buffer.byteLength(file.content, 'utf8'), 0) / 1024;
      const tokenCount = estimateTokens(output);
      output += `--- STATS ---\nFiles: ${fileContents.length}\nTotal Size: ${totalSizeKB.toFixed(2)} KB\nToken Count: ${tokenCount}`;
      return {
        content: [{
          type: "text",
          text: output
        }]
      };
    } catch (error) {
      // Failures are reported as tool output rather than thrown to the client.
      return {
        content: [{
          type: "text",
          text: `Error: ${error.message}`
        }]
      };
    }
  }
);
// Tool 4: Fetch specific file content
// Fetches one file from a repository by path, or — for Colab URLs — one cell
// ('cell-N', 1-based) or the whole notebook.
server.tool("specificContent",
  {
    url: z.string().describe("GitHub repository URL or Google Colab notebook URL"),
    filePath: z.string().describe("Specific file path to fetch (for repos) or cell index (for notebooks, e.g., 'cell-3')")
  },
  async ({ url, filePath }) => {
    try {
      // Handle Colab URLs
      if (url.includes('colab.research.google.com')) {
        const match = url.match(/colab\.research\.google\.com\/github\/([^/]+)\/([^/]+)\/blob\/([^/]+)\/(.+)/);
        if (match) {
          const [_, owner, repo, branch, file] = match;
          const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file}`;
          const response = await fetch(rawUrl);
          if (!response.ok) {
            throw new Error(`Failed to fetch notebook. Status: ${response.status}`);
          }
          const notebook = await response.json();
          // 'cell-N' selects one cell; anything else returns the whole notebook.
          if (filePath.startsWith('cell-')) {
            // Parse with an explicit radix, and reject non-numeric indices here:
            // previously 'cell-xyz' produced NaN, slipped past both range checks
            // (NaN comparisons are false), and crashed on an undefined cell.
            const cellNumber = Number.parseInt(filePath.slice('cell-'.length), 10);
            const cellIndex = cellNumber - 1;
            if (!Number.isInteger(cellIndex) || cellIndex < 0 || cellIndex >= notebook.cells.length) {
              throw new Error(`Cell ${cellNumber} not found. Notebook has ${notebook.cells.length} cells.`);
            }
            const cell = notebook.cells[cellIndex];
            const cellType = cell.cell_type.toUpperCase();
            // nbformat allows cell.source to be a string or a list of line strings.
            const source = Array.isArray(cell.source) ? cell.source.join('') : cell.source;
            return {
              content: [{
                type: "text",
                text: `--- CELL ${cellIndex + 1} (${cellType}) ---\n${source}`
              }]
            };
          } else {
            // Return full notebook if no specific cell requested
            const content = await getNotebookContent(rawUrl);
            return {
              content: [{
                type: "text",
                text: content
              }]
            };
          }
        }
        throw new Error("Could not parse Colab URL");
      }
      // Handle GitHub URLs
      const githubParts = parseGitHubUrl(url);
      if (!githubParts) {
        throw new Error("Invalid GitHub URL format");
      }
      const { owner, repo, treeSha } = githubParts;
      // Walk the full recursive tree to locate the requested blob by path.
      const { data: treeData } = await octokit.rest.git.getTree({
        owner,
        repo,
        tree_sha: treeSha,
        recursive: '1',
      });
      const targetFile = treeData.tree.find(file => file.path === filePath && file.type === 'blob');
      if (!targetFile) {
        throw new Error(`File '${filePath}' not found in repository`);
      }
      const content = await getFileContent(owner, repo, targetFile.sha);
      const tokenCount = estimateTokens(content);
      return {
        content: [{
          type: "text",
          text: `--- FILE: ${filePath} ---\n${content}\n\n--- STATS ---\nToken Count: ${tokenCount}\nSize: ${Buffer.byteLength(content, 'utf8')} bytes`
        }]
      };
    } catch (error) {
      // Failures are reported as tool output rather than thrown to the client.
      return {
        content: [{
          type: "text",
          text: `Error: ${error.message}`
        }]
      };
    }
  }
);
// Wire the server to stdio so an MCP client can spawn this file as a
// subprocess and speak the protocol over stdin/stdout.
const transport = new StdioServerTransport();
await server.connect(transport);