UNPKG

mcard-js

Version:

MCard - Content-addressable storage with cryptographic hashing, handle resolution, and vector search for Node.js and browsers

126 lines 4.46 kB
import * as fs from 'fs'; import * as path from 'path'; /** * Find the project root directory by looking for pyproject.toml */ export function findProjectRoot(startDir = process.cwd()) { let searchDir = startDir; for (let i = 0; i < 5; i++) { if (fs.existsSync(path.join(searchDir, 'pyproject.toml'))) { return searchDir; } searchDir = path.dirname(searchDir); } return startDir; } /** * List files in a directory, optionally recursively. * Skips hidden files and problematic binary files. */ export function listFiles(dirPath, recursive) { const files = []; if (!fs.existsSync(dirPath)) { return files; } const entries = fs.readdirSync(dirPath, { withFileTypes: true }); for (const entry of entries) { const fullPath = path.join(dirPath, entry.name); if (entry.isFile()) { // Skip hidden files and problematic patterns if (!entry.name.startsWith('.') && !isProblematicFile(fullPath)) { files.push(fullPath); } } else if (entry.isDirectory() && recursive) { files.push(...listFiles(fullPath, recursive)); } } return files; } /** * Check if a file is likely problematic (too large, binary garbage, etc.) */ export function isProblematicFile(filePath) { try { const stats = fs.statSync(filePath); // Skip files larger than 50MB if (stats.size > 50 * 1024 * 1024) return true; // Check for unstructured binary by sampling if (stats.size > 1024) { const fd = fs.openSync(filePath, 'r'); const buffer = Buffer.alloc(1024); fs.readSync(fd, buffer, 0, 1024, 0); fs.closeSync(fd); // Count null bytes and control characters let nullCount = 0; let controlCount = 0; for (let i = 0; i < buffer.length; i++) { if (buffer[i] === 0) nullCount++; else if (buffer[i] < 32 && buffer[i] !== 9 && buffer[i] !== 10 && buffer[i] !== 13) controlCount++; } // If >30% null bytes, likely unstructured binary if (nullCount > 300) return true; } return false; } catch { return true; } } /** * Detect content type based on extension and content buffer. */ import { ContentTypeInterpreter } from '../../model/ContentTypeInterpreter.js'; /** * Detect content type based on extension and content buffer. * Delegates to the central ContentTypeInterpreter. */ export function detectContentType(filePath, content) { const ext = path.extname(filePath).toLowerCase(); // Convert Buffer to Uint8Array for the interpreter const uint8Args = new Uint8Array(content.buffer, content.byteOffset, content.byteLength); const result = ContentTypeInterpreter.detectContentType(uint8Args, ext); return result.mimeType; } function toRecord(value) { return typeof value === 'object' && value !== null ? value : {}; } /** * Extract loader-specific parameters from CLM context, mirroring Python behavior. */ export function extractLoaderParams(ctx, defaults = {}) { const params = toRecord(ctx.params); const balanced = toRecord(ctx.balanced); const inputArgs = { ...toRecord(balanced.input_arguments), ...toRecord(ctx.input_arguments), }; const outputArgs = { ...toRecord(balanced.output_arguments), ...toRecord(ctx.output_arguments), }; const allParams = { ...inputArgs, ...outputArgs, ...params }; const sourceDir = allParams.source_dir ?? defaults.sourceDir ?? 'test_data'; const recursive = params.recursive !== undefined ? params.recursive !== false : allParams.recursive !== false; const dbPath = allParams.db_path ?? defaults.dbPath; return { params, inputArgs, outputArgs, allParams, sourceDir, recursive, dbPath, }; } export function computeTimingMetrics(startTime, processedCount) { const durationSeconds = (Date.now() - startTime) / 1000; const time_s = Math.round(durationSeconds * 10000) / 10000; const files_per_sec = durationSeconds > 0 ? Math.round((processedCount / durationSeconds) * 100) / 100 : 0; return { durationSeconds, time_s, files_per_sec }; } //# sourceMappingURL=FileSystemUtils.js.map