/*
 * zrald1
 * Version:
 * Advanced Graph RAG MCP Server with file location identification, graph processing, and result summarization capabilities
 * 254 lines • 10 kB
 * JavaScript
 */
import * as fs from 'fs/promises';
import * as path from 'path';
import { v4 as uuidv4 } from 'uuid';
/**
 * Locates files on disk by name or content, turns them into plain file
 * records, and converts those records into graph nodes, text chunks and
 * summary statistics.
 */
export class FileProcessor {
    /** Effective options: `defaultOptions` overlaid with the constructor argument. */
    options;
    defaultOptions = {
        recursive: true,
        maxFileSize: 10 * 1024 * 1024, // 10MB
        allowedExtensions: ['.txt', '.md', '.js', '.ts', '.py', '.json', '.xml', '.html', '.css'],
        excludePatterns: ['node_modules', '.git', 'dist', 'build', '.env'],
        includeContent: true
    };

    /**
     * @param {object} [options] - Overrides for any key of `defaultOptions`.
     *   Keys present in `options` win over the defaults (shallow merge).
     */
    constructor(options = {}) {
        this.options = { ...this.defaultOptions, ...options };
    }

    /**
     * Searches every path in `searchPaths` for files whose name (or, when
     * `options.includeContent` is set, whose content) contains `query`.
     *
     * A failure while searching one path is logged with `console.warn` and
     * does not abort the remaining paths.
     *
     * @param {object} searchOptions
     * @param {string} searchOptions.query - Substring to look for.
     * @param {string[]} searchOptions.searchPaths - Directories to search.
     * @param {string[]} [searchOptions.fileTypes] - When non-empty, only files
     *   whose extension (as returned by `path.extname`) is listed are considered.
     * @param {boolean} [searchOptions.recursive=true] - Descend into subdirectories.
     * @param {number} [searchOptions.maxResults=50] - Upper bound on returned records.
     * @param {boolean} [searchOptions.caseSensitive=false] - Match case exactly.
     * @returns {Promise<object[]>} At most `maxResults` file records.
     */
    async identifyFiles(searchOptions) {
        const files = [];
        const { query, searchPaths, fileTypes, recursive = true, maxResults = 50, caseSensitive = false } = searchOptions;
        for (const searchPath of searchPaths) {
            try {
                // Only ask for as many results as are still missing, so the
                // directory walk can stop as soon as the limit is reached.
                const remaining = maxResults - files.length;
                const foundFiles = await this.searchInDirectory(searchPath, query, fileTypes, recursive, caseSensitive, remaining);
                files.push(...foundFiles);
                if (files.length >= maxResults) {
                    break;
                }
            }
            catch (error) {
                console.warn(`Error searching in ${searchPath}:`, error instanceof Error ? error.message : String(error));
            }
        }
        return files.slice(0, maxResults);
    }

    /**
     * Walks one directory and collects records for files matching `query`.
     *
     * A file matches when its name contains the query; if it does not and
     * `options.includeContent` is set, its content is searched instead.
     * Entries rejected by `shouldExclude` are skipped. Read errors are logged
     * with `console.warn` and yield the results gathered so far.
     *
     * @param {string} dirPath - Directory to read.
     * @param {string} query - Substring to look for.
     * @param {string[]} [fileTypes] - Optional extension allow-list.
     * @param {boolean} [recursive=true] - Descend into subdirectories.
     * @param {boolean} [caseSensitive=false] - Match case exactly.
     * @param {number} [maxResults=Infinity] - Stop once this many records were collected.
     * @returns {Promise<object[]>} Matching file records, in traversal order.
     */
    async searchInDirectory(dirPath, query, fileTypes, recursive = true, caseSensitive = false, maxResults = Infinity) {
        const files = [];
        const needle = caseSensitive ? query : query.toLowerCase();
        if (maxResults <= 0) {
            return files;
        }
        try {
            const entries = await fs.readdir(dirPath, { withFileTypes: true });
            for (const entry of entries) {
                if (files.length >= maxResults) {
                    break;
                }
                // Skip excluded patterns
                if (this.shouldExclude(entry.name)) {
                    continue;
                }
                const fullPath = path.join(dirPath, entry.name);
                if (entry.isDirectory() && recursive) {
                    const subFiles = await this.searchInDirectory(fullPath, query, fileTypes, recursive, caseSensitive, maxResults - files.length);
                    files.push(...subFiles);
                    continue;
                }
                if (!entry.isFile()) {
                    continue;
                }
                // Check file type filter
                const extension = path.extname(entry.name);
                if (fileTypes && fileTypes.length > 0 && !fileTypes.includes(extension)) {
                    continue;
                }
                // A name match wins; content is only read when the name does not match.
                const fileName = caseSensitive ? entry.name : entry.name.toLowerCase();
                const isMatch = fileName.includes(needle) ||
                    (this.options.includeContent && await this.searchInFileContent(fullPath, needle, caseSensitive));
                if (!isMatch) {
                    continue;
                }
                const file = await this.processFile(fullPath);
                if (file) {
                    files.push(file);
                }
            }
        }
        catch (error) {
            console.warn(`Error reading directory ${dirPath}:`, error instanceof Error ? error.message : String(error));
        }
        return files;
    }

    /**
     * Reports whether the text content of `filePath` contains `query`.
     *
     * Files larger than `options.maxFileSize` or whose extension is not in
     * `options.allowedExtensions` are never read and count as "no match".
     *
     * @param {string} filePath - File to inspect.
     * @param {string} query - Substring to look for; callers pass it already
     *   lower-cased when `caseSensitive` is false.
     * @param {boolean} caseSensitive - When false the content is lower-cased before matching.
     * @returns {Promise<boolean>} `true` on a match; `false` otherwise, including on any I/O error.
     */
    async searchInFileContent(filePath, query, caseSensitive) {
        try {
            const stats = await fs.stat(filePath);
            // Skip large files
            if (stats.size > this.options.maxFileSize) {
                return false;
            }
            // Only search in allowed text files
            if (!this.isTextFile(path.extname(filePath))) {
                return false;
            }
            const content = await fs.readFile(filePath, 'utf-8');
            const haystack = caseSensitive ? content : content.toLowerCase();
            return haystack.includes(query);
        }
        catch (error) {
            // File might be binary or inaccessible
            return false;
        }
    }

    /**
     * Builds a file record for `filePath`.
     *
     * The record carries a fresh UUID, the path, the base name without
     * extension, the extension, the size, timestamps from `fs.stat`, and
     * (when `options.includeContent` is set and the extension is allowed)
     * the UTF-8 content. A failure to read the content is logged and leaves
     * `content` unset.
     *
     * @param {string} filePath - File to describe.
     * @returns {Promise<object|null>} The record, or `null` when the file is
     *   larger than `options.maxFileSize` or cannot be stat'ed.
     */
    async processFile(filePath) {
        try {
            const stats = await fs.stat(filePath);
            const parsedPath = path.parse(filePath);
            // Skip large files
            if (stats.size > this.options.maxFileSize) {
                console.warn(`File ${filePath} is too large (${stats.size} bytes), skipping`);
                return null;
            }
            const file = {
                id: uuidv4(),
                path: filePath,
                name: parsedPath.name,
                extension: parsedPath.ext,
                size: stats.size,
                metadata: {
                    created: stats.birthtime,
                    modified: stats.mtime,
                    accessed: stats.atime,
                    directory: parsedPath.dir,
                    fullName: parsedPath.base
                },
                created_at: new Date(),
                updated_at: new Date()
            };
            // Include content if requested and file is text-based
            if (this.options.includeContent && this.isTextFile(parsedPath.ext)) {
                try {
                    file.content = await fs.readFile(filePath, 'utf-8');
                }
                catch (error) {
                    console.warn(`Could not read content of ${filePath}:`, error instanceof Error ? error.message : String(error));
                }
            }
            return file;
        }
        catch (error) {
            console.warn(`Error processing file ${filePath}:`, error instanceof Error ? error.message : String(error));
            return null;
        }
    }

    /**
     * Builds file records for every non-excluded file in `dirPath`,
     * descending into subdirectories when `options.recursive` is set.
     * Read errors are logged and yield the records gathered so far.
     *
     * @param {string} dirPath - Directory to process.
     * @returns {Promise<object[]>} File records in traversal order.
     */
    async processDirectory(dirPath) {
        const files = [];
        try {
            const entries = await fs.readdir(dirPath, { withFileTypes: true });
            for (const entry of entries) {
                if (this.shouldExclude(entry.name)) {
                    continue;
                }
                const fullPath = path.join(dirPath, entry.name);
                if (entry.isDirectory() && this.options.recursive) {
                    const subFiles = await this.processDirectory(fullPath);
                    files.push(...subFiles);
                }
                else if (entry.isFile()) {
                    const file = await this.processFile(fullPath);
                    if (file) {
                        files.push(file);
                    }
                }
            }
        }
        catch (error) {
            console.warn(`Error processing directory ${dirPath}:`, error instanceof Error ? error.message : String(error));
        }
        return files;
    }

    /**
     * Maps file records to graph nodes of type `'file'`.
     *
     * @param {object[]} files - File records as produced by `processFile`.
     * @returns {object[]} One node per record, reusing the record's id,
     *   metadata and timestamps.
     */
    filesToNodes(files) {
        return files.map((file) => ({
            id: file.id,
            type: 'file',
            label: file.name,
            properties: {
                path: file.path,
                extension: file.extension,
                size: file.size,
                content: file.content
            },
            metadata: file.metadata,
            created_at: file.created_at,
            updated_at: file.updated_at
        }));
    }

    /**
     * Splits a file's content into overlapping, fixed-size chunks.
     *
     * Consecutive chunks start `chunkSize - overlap` characters apart.
     * Chunking stops with the first chunk that reaches the end of the
     * content, so no chunk is a pure subset of its predecessor.
     *
     * @param {object} file - File record; only `content`, `id`, `path` and `name` are read.
     * @param {number} [chunkSize=1000] - Maximum characters per chunk; must be positive.
     * @param {number} [overlap=100] - Characters shared by neighbouring chunks;
     *   must be smaller than `chunkSize`.
     * @returns {object[]} Chunks with sequential `position`; empty when the file has no content.
     * @throws {RangeError} When `chunkSize` is not positive or `overlap` is not
     *   smaller than `chunkSize` (either would keep the scan from advancing).
     */
    fileToChunks(file, chunkSize = 1000, overlap = 100) {
        if (!file.content) {
            return [];
        }
        if (!(chunkSize > 0)) {
            throw new RangeError(`chunkSize must be a positive number, got ${chunkSize}`);
        }
        const step = chunkSize - overlap;
        if (!(step > 0)) {
            throw new RangeError(`overlap (${overlap}) must be smaller than chunkSize (${chunkSize})`);
        }
        const chunks = [];
        const content = file.content;
        for (let start = 0; start < content.length; start += step) {
            const end = Math.min(start + chunkSize, content.length);
            chunks.push({
                id: uuidv4(),
                content: content.slice(start, end),
                document_id: file.id,
                position: chunks.length,
                entities: [],
                metadata: {
                    file_path: file.path,
                    file_name: file.name,
                    start_char: start,
                    end_char: end
                }
            });
            // Everything after this point would only repeat the tail of this chunk.
            if (end >= content.length) {
                break;
            }
        }
        return chunks;
    }

    /**
     * Decides whether a directory entry should be skipped.
     *
     * An entry is excluded when its name starts with `.` or contains any of
     * `options.excludePatterns` as a substring. With an empty pattern list
     * nothing is excluded, dot-prefixed names included.
     *
     * @param {string} name - Entry name (not a full path).
     * @returns {boolean} `true` when the entry must be skipped.
     */
    shouldExclude(name) {
        const { excludePatterns } = this.options;
        if (excludePatterns.length === 0) {
            return false;
        }
        return name.startsWith('.') || excludePatterns.some((pattern) => name.includes(pattern));
    }

    /**
     * Reports whether `extension` is listed in `options.allowedExtensions`.
     * The comparison ignores case, so `.MD` and `.md` are treated alike.
     *
     * @param {string} extension - Extension including the leading dot.
     * @returns {boolean} `true` when the extension is allowed.
     */
    isTextFile(extension) {
        const wanted = String(extension).toLowerCase();
        return this.options.allowedExtensions.some((allowed) => String(allowed).toLowerCase() === wanted);
    }

    /**
     * Generates a summary of file processing results.
     *
     * @param {object[]} files - File records; `extension`, `size` and
     *   `metadata.directory` are read.
     * @returns {object} Counts per extension (`'no_extension'` for an empty
     *   one), total and average size, the largest and smallest record
     *   (`null` for an empty list), the distinct directories, and an ISO
     *   timestamp of when the summary was built.
     */
    generateSummary(files) {
        const summary = {
            total_files: files.length,
            file_types: {},
            total_size: 0,
            largest_file: null,
            smallest_file: null,
            directories: new Set(),
            processing_timestamp: new Date().toISOString()
        };
        for (const file of files) {
            // Count file types
            const ext = file.extension || 'no_extension';
            summary.file_types[ext] = (summary.file_types[ext] || 0) + 1;
            // Calculate total size
            summary.total_size += file.size;
            // Track largest and smallest files (first one wins on ties)
            if (!summary.largest_file || file.size > summary.largest_file.size) {
                summary.largest_file = file;
            }
            if (!summary.smallest_file || file.size < summary.smallest_file.size) {
                summary.smallest_file = file;
            }
            // Track directories
            if (file.metadata?.directory) {
                summary.directories.add(file.metadata.directory);
            }
        }
        return {
            ...summary,
            directories: Array.from(summary.directories),
            average_file_size: files.length > 0 ? summary.total_size / files.length : 0
        };
    }
}
//# sourceMappingURL=file-processor.js.map