mongodocs-mcp
Version:
Lightning-fast semantic search for MongoDB documentation via Model Context Protocol. 10,000+ documents, <500ms search.
335 lines • 12 kB
JavaScript
/**
* Universal Document Fetcher - The ONLY fetcher you need
* Intelligently fetches from GitHub, Web, and APIs
*
* This replaces:
* - DocumentFetcher (GitHub only)
* - VoyageDocumentFetcher (Voyage only)
* - MegaDocumentFetcher (overcomplicated)
* - Inline fetching in index-docs.ts
*/
import axios from 'axios';
import crypto from 'crypto';
import pLimit from 'p-limit';
import * as cheerio from 'cheerio';
import { simpleGit } from 'simple-git';
import * as fs from 'fs/promises';
import * as path from 'path';
/**
 * Fetches documentation from GitHub repositories, web pages and JSON APIs and
 * normalizes every result into a `{ id, content, metadata }` document, where
 * `metadata` carries `path`, `product`, `version`, `title` and `url`.
 */
export class UniversalFetcher {
  /** GitHub token taken from `process.env.GITHUB_TOKEN` (may be undefined). */
  githubToken;
  /** Limiter created with `pLimit(5)`; all HTTP requests are routed through it. */
  rateLimiter = pLimit(5);

  constructor() {
    this.githubToken = process.env.GITHUB_TOKEN;
  }

  /**
   * Fetch documents from a single source, dispatching on `source.type`
   * (`'github'`, `'web'` or `'api'`).
   *
   * Never rejects: unknown types and any thrown error are logged and yield `[]`.
   *
   * @param {object} source - Source descriptor (`name`, `type`, plus type-specific fields).
   * @returns {Promise<object[]>} Documents fetched from the source.
   */
  async fetchFromSource(source) {
    console.log(`📥 Fetching ${source.name} (${source.type})...`);
    try {
      switch (source.type) {
        case 'github':
          return await this.fetchFromGitHub(source);
        case 'web':
          return await this.fetchFromWeb(source);
        case 'api':
          return await this.fetchFromAPI(source);
        default:
          console.error(`Unknown source type: ${source.type}`);
          return [];
      }
    } catch (error) {
      console.error(`Failed to fetch ${source.name}:`, error);
      return [];
    }
  }

  /**
   * Fetch from many sources. Sources are grouped by `priority` (default 1);
   * groups run from highest to lowest priority, and the sources inside one
   * group run in parallel. The combined result is deduplicated by document id,
   * so a document from a higher-priority source wins over a later duplicate.
   *
   * @param {object[]} sources - Source descriptors.
   * @returns {Promise<object[]>} Deduplicated documents.
   */
  async fetchFromSources(sources) {
    console.log(`🚀 Fetching from ${sources.length} sources...`);

    const priorityGroups = new Map();
    for (const source of sources) {
      // `??` rather than `||`: an explicit priority of 0 must stay 0 (lowest),
      // not silently become the default of 1.
      const priority = source.priority ?? 1;
      const group = priorityGroups.get(priority);
      if (group) {
        group.push(source);
      } else {
        priorityGroups.set(priority, [source]);
      }
    }

    const allDocuments = [];
    const priorities = [...priorityGroups.keys()].sort((a, b) => b - a);
    for (const priority of priorities) {
      console.log(`\n📊 Processing priority ${priority} sources...`);
      const group = priorityGroups.get(priority);
      const results = await Promise.allSettled(
        group.map((source) => this.fetchFromSource(source)),
      );
      results.forEach((result, index) => {
        const { name } = group[index];
        if (result.status === 'fulfilled') {
          // Plain loop instead of push(...spread): spreading a very large
          // array can exceed the engine's argument limit.
          for (const doc of result.value) {
            allDocuments.push(doc);
          }
          console.log(`✅ ${name}: ${result.value.length} documents`);
        } else {
          console.error(`❌ ${name}: Failed`, result.reason);
        }
      });
    }

    return this.deduplicateDocuments(allDocuments);
  }

  /**
   * Resolve the branch and local checkout directory used for a GitHub source.
   * The directory layout is `mongodb-docs/<owner>_<repo>/<branch>`.
   *
   * @param {object} source - Source descriptor with `repo` and optional `branch`.
   * @returns {{ branch: string, repoPath: string }}
   */
  resolveRepoLocation(source) {
    const branch = source.branch || 'main';
    const repo = String(source.repo ?? '');
    // Only the first slash is replaced, matching the existing on-disk layout.
    const repoPath = path.join('mongodb-docs', repo.replace('/', '_'), branch);
    return { branch, repoPath };
  }

  /**
   * Clone (or update) `source.repo` and turn every documentation file found
   * in the checkout into a document.
   *
   * @param {object} source - Source descriptor with `repo` (`owner/name`) and optional `branch`.
   * @returns {Promise<object[]>} Documents; `[]` when `source.repo` is missing.
   */
  async fetchFromGitHub(source) {
    if (!source.repo) {
      return [];
    }
    const { branch, repoPath } = this.resolveRepoLocation(source);
    await this.cloneOrUpdateRepo(source.repo, branch, repoPath);

    const files = await this.findDocFiles(repoPath);
    const documents = [];
    // Read files in batches so only a bounded number are open at once.
    const batchSize = 50;
    for (let i = 0; i < files.length; i += batchSize) {
      const batch = files.slice(i, i + batchSize);
      const batchDocs = await Promise.all(
        batch.map((file) => this.processFile(file, source, repoPath)),
      );
      for (const doc of batchDocs) {
        if (doc !== null) {
          documents.push(doc);
        }
      }
    }
    return documents;
  }

  /**
   * Download `source.url` and emit one document per element matching a set of
   * common "main content" selectors whose cleaned text exceeds 100 characters.
   *
   * @param {object} source - Source descriptor with `url`.
   * @returns {Promise<object[]>} Documents; `[]` when `url` is missing or the fetch fails.
   */
  async fetchFromWeb(source) {
    if (!source.url) {
      return [];
    }
    try {
      const response = await this.rateLimiter(() => axios.get(source.url, { timeout: 30000 }));
      const $ = cheerio.load(response.data);
      const documents = [];
      const selectors = [
        'article', '.documentation', '.content', 'main',
        '.doc-content', '.markdown-body', '[role="main"]',
      ];
      for (const selector of selectors) {
        $(selector).each((index, elem) => {
          const content = this.extractCleanText($, elem);
          if (content.length <= 100) {
            return;
          }
          // The first match keeps the historical id seed; later matches of the
          // same selector get an index suffix so they are not discarded as
          // duplicates by deduplicateDocuments().
          const idSeed = index === 0
            ? `${source.url}-${selector}`
            : `${source.url}-${selector}-${index}`;
          documents.push({
            id: this.generateId(idSeed),
            content,
            metadata: {
              path: source.url,
              product: source.product,
              version: source.version,
              title: source.name,
              url: source.url,
            },
          });
        });
      }
      return documents;
    } catch (error) {
      console.error(`Failed to fetch web page ${source.url}:`, error);
      return [];
    }
  }

  /**
   * Fetch JSON from `source.url`. An array response is mapped item by item;
   * a single object response is treated as one item.
   *
   * @param {object} source - Source descriptor with `url`.
   * @returns {Promise<object[]>} Documents; `[]` when `url` is missing or the fetch fails.
   */
  async fetchFromAPI(source) {
    if (!source.url) {
      return [];
    }
    try {
      const headers = { Accept: 'application/json' };
      if (this.githubToken) {
        // NOTE(review): the GitHub token is sent to whatever URL the source
        // names. Confirm that 'api' sources only ever point at GitHub hosts.
        headers.Authorization = `Bearer ${this.githubToken}`;
      }
      const response = await this.rateLimiter(() => axios.get(source.url, {
        headers,
        timeout: 30000,
      }));

      const data = response.data;
      if (Array.isArray(data)) {
        return data
          .map((item) => this.processAPIItem(item, source))
          .filter((doc) => doc !== null);
      }
      if (data && typeof data === 'object') {
        // processAPIItem() accepts content/description/body, so a single
        // object is handed over as-is instead of requiring `content`.
        const doc = this.processAPIItem(data, source);
        return doc === null ? [] : [doc];
      }
      return [];
    } catch (error) {
      console.error(`Failed to fetch API ${source.url}:`, error);
      return [];
    }
  }

  /**
   * Make sure `targetPath` holds a checkout of `repo` at `branch`.
   *
   * If `targetPath/.git` exists the checkout is updated (checkout + pull);
   * otherwise a shallow single-branch clone is created. Errors from git
   * propagate to the caller unchanged.
   *
   * @param {string} repo - GitHub `owner/name`.
   * @param {string} branch - Branch to check out.
   * @param {string} targetPath - Local checkout directory.
   * @returns {Promise<void>}
   */
  async cloneOrUpdateRepo(repo, branch, targetPath) {
    const git = simpleGit();
    // Only the existence probe is allowed to "fail softly". A failed pull must
    // not fall through to `clone`, which would then fail on the populated
    // directory and hide the real error.
    const hasCheckout = await fs.access(path.join(targetPath, '.git')).then(
      () => true,
      () => false,
    );
    if (hasCheckout) {
      await git.cwd(targetPath);
      await git.checkout(branch);
      await git.pull('origin', branch);
      return;
    }
    await fs.mkdir(path.dirname(targetPath), { recursive: true });
    await git.clone(
      `https://github.com/${repo}.git`,
      targetPath,
      ['--branch', branch, '--depth', '1', '--single-branch'],
    );
  }

  /**
   * Recursively collect documentation files below `dir`.
   *
   * Files are matched by lower-cased extension; well-known build, dependency
   * and test directories are not descended into.
   *
   * @param {string} dir - Directory to scan.
   * @returns {Promise<string[]>} Paths of matching files (joined onto `dir`).
   */
  async findDocFiles(dir) {
    const files = [];
    const extensions = new Set(['.md', '.mdx', '.rst', '.txt', '.json', '.yaml', '.yml']);
    const skipDirs = new Set(['node_modules', '.git', 'build', 'dist', 'coverage', 'test']);

    const walk = async (currentDir) => {
      let entries;
      try {
        entries = await fs.readdir(currentDir, { withFileTypes: true });
      } catch {
        // Deliberately best-effort: unreadable directories are skipped.
        return;
      }
      for (const entry of entries) {
        const fullPath = path.join(currentDir, entry.name);
        if (entry.isDirectory()) {
          if (!skipDirs.has(entry.name)) {
            await walk(fullPath);
          }
        } else if (entry.isFile() && extensions.has(path.extname(entry.name).toLowerCase())) {
          files.push(fullPath);
        }
      }
    };

    await walk(dir);
    return files;
  }

  /**
   * Read a file from a checkout and convert it into a document.
   *
   * @param {string} filePath - Path of the file to read.
   * @param {object} source - Source descriptor (`repo`, optional `branch`, `product`, `version`).
   * @param {string} [repoRoot] - Root of the checkout; defaults to the directory
   *   derived from `source` (see resolveRepoLocation). Used to build the GitHub URL.
   * @returns {Promise<object|null>} The document, or `null` when the file is
   *   unreadable, shorter than 50 characters, or empty after cleaning.
   */
  async processFile(filePath, source, repoRoot) {
    try {
      const content = await fs.readFile(filePath, 'utf-8');
      if (content.length < 50) {
        return null;
      }
      const cleanedContent = this.cleanContent(content, path.extname(filePath));
      if (cleanedContent.length === 0) {
        return null;
      }

      const { branch, repoPath } = this.resolveRepoLocation(source);
      // The URL path must be relative to the repository root, not to the
      // process working directory (which would embed the local cache prefix).
      const relativePath = path
        .relative(repoRoot ?? repoPath, filePath)
        .split(path.sep)
        .map(encodeURIComponent)
        .join('/');

      return {
        id: this.generateId(filePath),
        content: cleanedContent,
        metadata: {
          path: filePath,
          product: source.product,
          version: source.version,
          title: this.extractTitle(cleanedContent) || path.basename(filePath),
          url: `https://github.com/${source.repo}/blob/${branch}/${relativePath}`,
        },
      };
    } catch {
      // Best-effort: a file that cannot be read or processed is skipped.
      return null;
    }
  }

  /**
   * Convert one API response item into a document.
   *
   * Content is taken from `content`, `description` or `body` (first truthy).
   *
   * @param {object} item - Raw API item.
   * @param {object} source - Source descriptor used for metadata fallbacks.
   * @returns {object|null} The document, or `null` when the item is not an
   *   object, has no string content, or the content is shorter than 50 characters.
   */
  processAPIItem(item, source) {
    if (item === null || typeof item !== 'object') {
      return null;
    }
    const content = item.content || item.description || item.body;
    if (typeof content !== 'string' || content.length < 50) {
      return null;
    }
    return {
      // generateId() stringifies its input, so numeric ids are safe here.
      id: this.generateId(item.id || item.name || content),
      content,
      metadata: {
        path: item.path || item.url || source.url,
        product: source.product,
        version: source.version,
        title: item.title || item.name || source.name,
        url: item.url || source.url,
      },
    };
  }

  /**
   * Strip boilerplate from raw file content.
   *
   * Removes HTML comments, a "Table of Contents" section (up to the next
   * heading) and a leading `---` front-matter block. For `.rst` files it also
   * removes directive lines, option lines and link targets, and unwraps
   * interpreted-text roles (`:role:`text`` becomes `text`).
   *
   * @param {string} content - Raw file content.
   * @param {string} extension - File extension including the dot (any case).
   * @returns {string} Cleaned, trimmed content.
   */
  cleanContent(content, extension) {
    let cleaned = content
      .replace(/<!--.*?-->/gs, '') // HTML comments
      .replace(/^\s*#\s*Table of Contents.*?(?=^#)/ms, '') // TOC
      // Front matter exists only at the very start of the document. Without
      // the anchor, text between two `---` horizontal rules would be deleted.
      .replace(/^\s*---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?=\r?\n|$)/, '');

    if (String(extension ?? '').toLowerCase() === '.rst') {
      cleaned = cleaned
        .replace(/^\.\. .+::.*/gm, '') // directives
        .replace(/^ :.+:.*/gm, '') // directive options
        .replace(/::?[\w-]+(?::[\w-]+)*:`([^`]+)`/g, '$1') // :role:`text` -> text
        .replace(/^\.\. _.*:/gm, ''); // link targets
    }
    return cleaned.trim();
  }

  /**
   * Extract whitespace-normalized text from an element.
   *
   * Note: `script`, `style`, `nav`, `header` and `footer` descendants are
   * removed from the loaded document itself, not from a copy.
   *
   * @param {Function} $ - Cheerio root returned by `cheerio.load`.
   * @param {object} element - Element to read.
   * @returns {string} Text with every whitespace run collapsed to one space.
   */
  extractCleanText($, element) {
    $('script, style, nav, header, footer', element).remove();
    return $(element).text().replace(/\s+/g, ' ').trim();
  }

  /**
   * Find a title in document content.
   *
   * Prefers the first Markdown `# heading`; otherwise looks within the first
   * lines for a text line followed by an underline made of `=`, `-` or `~`.
   *
   * @param {string} content - Document content.
   * @returns {string|null} The title, or `null` when none is found.
   */
  extractTitle(content) {
    const mdMatch = content.match(/^#\s+(.+)$/m);
    if (mdMatch) {
      return mdMatch[1].trim();
    }

    const adornment = /^[=\-~]+$/;
    // Split on CRLF as well, so a trailing "\r" does not defeat the
    // underline check.
    const lines = content.split(/\r?\n/);
    const limit = Math.min(5, lines.length - 1);
    for (let i = 0; i < limit; i++) {
      const candidate = lines[i].trim();
      // Skip blank lines and overlines: the title is the text line itself.
      if (candidate.length === 0 || adornment.test(candidate)) {
        continue;
      }
      if (adornment.test(lines[i + 1].trimEnd())) {
        return candidate;
      }
    }
    return null;
  }

  /**
   * Derive a document id: the first 16 hex characters of the SHA-256 digest.
   *
   * @param {*} input - Seed value; anything that is not a string or a binary
   *   view is converted with `String()` before hashing.
   * @returns {string} 16-character lowercase hex id.
   */
  generateId(input) {
    const data = typeof input === 'string' || ArrayBuffer.isView(input) ? input : String(input);
    return crypto.createHash('sha256').update(data).digest('hex').substring(0, 16);
  }

  /**
   * Drop documents whose `id` was already seen, keeping the first occurrence.
   *
   * @param {object[]} documents - Documents, possibly with repeated ids.
   * @returns {object[]} New array in the original order without duplicates.
   */
  deduplicateDocuments(documents) {
    const seen = new Set();
    return documents.filter((doc) => {
      if (seen.has(doc.id)) {
        return false;
      }
      seen.add(doc.id);
      return true;
    });
  }
}
//# sourceMappingURL=universal-fetcher.js.map