/**
 * mcard-js — Loader
 * MCard: content-addressable storage with cryptographic hashing, handle
 * resolution, and vector search for Node.js and browsers.
 */
import * as fs from 'fs/promises';
import * as path from 'path';
import { MCard } from './model/MCard';
import { isProblematicFile, processFileContent, streamReadNormalizedText, listFiles } from './FileIO';
import { ContentTypeInterpreter } from './model/ContentTypeInterpreter';
// Constants replicating settings
const DEFAULT_MAX_PROBLEM_BYTES = 2 * 1024 * 1024; // 2MB — byte cap applied when ingesting problematic files
const WRAP_WIDTH_KNOWN = 1000; // wrap width for extensions flagged by ContentTypeInterpreter.isKnownLongLineExtension
const WRAP_WIDTH_DEFAULT = 80; // wrap width for all other normalized text
/**
 * Process a single file and store its content in the collection as an MCard.
 *
 * Problematic files (as judged by `isProblematicFile`) are skipped unless
 * `allowProblematic` is set; when allowed, they are read as normalized wrapped
 * text capped at `maxBytesOnProblem` bytes, falling back to a capped binary
 * read if text normalization fails. Empty files are skipped gracefully (they
 * cannot be stored as MCards) and reported with an empty hash.
 *
 * @param {string} filePath - Path of the file to ingest.
 * @param {object} collection - Target collection; must expose `add()` and `addWithHandle()`.
 * @param {object} [options]
 * @param {boolean} [options.allowProblematic=false] - Ingest problematic files as safe text.
 * @param {number} [options.maxBytesOnProblem=DEFAULT_MAX_PROBLEM_BYTES] - Byte cap for problematic files.
 * @param {boolean} [options.metadataOnly=false] - For problematic files, record metadata without storing content.
 * @param {string} [options.rootPath] - Root used to derive a relative-path handle on name collisions.
 * @returns {Promise<object|null>} Result record, or null when skipped or on error.
 */
export async function processAndStoreFile(filePath, collection, options = {}) {
    const { allowProblematic = false, maxBytesOnProblem = DEFAULT_MAX_PROBLEM_BYTES, metadataOnly = false, rootPath } = options;
    try {
        // Check once and reuse: the previous version awaited isProblematicFile()
        // twice per file (here and again before storing), doubling inspection I/O.
        const isProblematic = await isProblematicFile(filePath);
        let fileInfo;
        if (isProblematic) {
            if (!allowProblematic) {
                console.warn(`Skipping problematic file: ${filePath}`);
                return null;
            }
            const extension = path.extname(filePath).toLowerCase();
            const isKnownType = ContentTypeInterpreter.isKnownLongLineExtension(extension);
            const wrapWidth = isKnownType ? WRAP_WIDTH_KNOWN : WRAP_WIDTH_DEFAULT;
            console.warn(`Problematic file detected, processing as safe text: ${filePath}`);
            try {
                const streamed = await streamReadNormalizedText(filePath, {
                    byteCap: maxBytesOnProblem,
                    wrapWidth
                });
                fileInfo = {
                    content: streamed.text,
                    filename: path.basename(filePath),
                    mimeType: 'text/plain',
                    extension: extension,
                    isBinary: false,
                    size: streamed.text.length,
                    originalSize: streamed.originalSize,
                    originalSha256Prefix: streamed.originalSha256Prefix,
                    normalized: true,
                    wrapWidth
                };
            }
            catch (e) {
                // Normalization failed; fall back to a size-capped binary read.
                console.warn(`Safe text processing failed, falling back to capped binary: ${filePath}`);
                fileInfo = await processFileContent(filePath, {
                    forceBinary: true,
                    allowPathological: true,
                    maxBytes: maxBytesOnProblem
                });
            }
        }
        else {
            console.log(`Processing file: ${filePath}`);
            fileInfo = await processFileContent(filePath);
        }
        if (!fileInfo)
            return null;
        // Check for empty content (e.g., empty __init__.py files).
        const content = fileInfo.content;
        const isEmpty = !content ||
            (typeof content === 'string' && content.length === 0) ||
            (content instanceof Uint8Array && content.length === 0);
        if (isEmpty) {
            // Empty files cannot be stored as MCards - skip gracefully.
            if (process.env.DEBUG) {
                console.log(`Skipping empty file: ${filePath} (empty files cannot be stored as MCards)`);
            }
            return {
                hash: '',
                contentType: fileInfo.mimeType,
                isBinary: fileInfo.isBinary,
                filename: fileInfo.filename,
                size: 0,
                filePath: filePath,
            };
        }
        let mcard = null;
        if (!(metadataOnly && isProblematic)) {
            mcard = await MCard.create(fileInfo.content);
            // Handle logic: prefer the bare filename; on collision retry with the
            // root-relative path; finally store by hash alone.
            const handle = path.basename(filePath);
            try {
                await collection.addWithHandle(mcard, handle);
            }
            catch (e) {
                let registered = false;
                if (rootPath) {
                    const relPath = path.relative(rootPath, filePath);
                    if (relPath !== handle) {
                        try {
                            await collection.addWithHandle(mcard, relPath);
                            registered = true;
                        }
                        catch (e2) {
                            // Handle name already in use - this is expected for common files like README.md, LICENSE
                            // The MCard is still stored and accessible by hash
                            if (process.env.DEBUG) {
                                console.log(`Handle name '${handle}' already in use (common for files like README.md, LICENSE). ` +
                                    `MCard stored successfully with hash ${mcard.hash.slice(0, 8)}... (accessible by hash, not by handle)`);
                            }
                        }
                    }
                }
                if (!registered) {
                    try {
                        await collection.add(mcard);
                    }
                    catch (e3) {
                        // Best-effort fallback (content may already exist under its
                        // hash); previously swallowed with no trace at all.
                        if (process.env.DEBUG) {
                            console.log(`Fallback add failed for ${filePath}: ${e3?.message ?? e3}`);
                        }
                    }
                }
            }
        }
        const result = {
            hash: mcard ? mcard.hash : 'METADATA_ONLY',
            contentType: fileInfo.mimeType,
            isBinary: fileInfo.isBinary,
            filename: fileInfo.filename,
            size: fileInfo.size,
            filePath: filePath,
        };
        if (fileInfo.originalSize !== undefined)
            result.originalSize = fileInfo.originalSize;
        if (fileInfo.originalSha256Prefix)
            result.originalSha256Prefix = fileInfo.originalSha256Prefix;
        if (metadataOnly && isProblematic)
            result.metadataOnly = true;
        return result;
    }
    catch (e) {
        console.error(`Error processing ${filePath}:`, e);
        return null;
    }
}
/**
 * Load a file or a directory tree into the collection.
 *
 * For a file, the root is its parent directory; for a directory, the root is
 * the directory itself and its contents are listed (recursively when
 * `recursive` is set). Each file is processed sequentially via
 * `processAndStoreFile`.
 *
 * @param {string} targetPath - File or directory to load.
 * @param {object} collection - Target collection passed through to `processAndStoreFile`.
 * @param {object} [options]
 * @param {boolean} [options.recursive=false] - Recurse into subdirectories.
 * @param {boolean} [options.includeProblematic=false] - Forwarded as `allowProblematic`.
 * @param {number} [options.maxBytesOnProblem=DEFAULT_MAX_PROBLEM_BYTES] - Byte cap for problematic files.
 * @param {boolean} [options.metadataOnly=false] - Forwarded to `processAndStoreFile`.
 * @returns {Promise<{metrics: object, results: object[]}>} Directory metrics and per-file results.
 * @throws {Error} If `targetPath` is neither a file nor a directory.
 */
export async function loadFileToCollection(targetPath, collection, options = {}) {
    const { recursive = false, includeProblematic = false, maxBytesOnProblem = DEFAULT_MAX_PROBLEM_BYTES, metadataOnly = false } = options;
    // Resolve absolute path
    const resolvedPath = path.resolve(targetPath);
    const stats = await fs.stat(resolvedPath);
    // Determine files to process
    let files = [];
    let rootPath = resolvedPath;
    if (stats.isFile()) {
        files = [resolvedPath];
        rootPath = path.dirname(resolvedPath);
    }
    else if (stats.isDirectory()) {
        files = await listFiles(resolvedPath, recursive);
    }
    else {
        throw new Error(`Path ${targetPath} is not a file or directory`);
    }
    // Metrics: distinct directories touched and deepest nesting level under root.
    const uniqueDirs = new Set();
    let maxDepth = 0;
    for (const file of files) {
        // Boundary-safe containment check. The previous version used
        // dir.startsWith(rootPath), which wrongly matches sibling paths that
        // merely share a string prefix (e.g. root "/data" vs file "/database/x").
        const rel = path.relative(rootPath, file);
        if (rel.startsWith('..') || path.isAbsolute(rel))
            continue;
        uniqueDirs.add(path.dirname(file));
        // Folder depth: "a.txt" -> 0, "sub/a.txt" -> 1, etc.
        const depth = rel.split(path.sep).length - 1;
        if (depth > maxDepth)
            maxDepth = depth;
    }
    const metrics = {
        filesCount: files.length,
        directoriesCount: uniqueDirs.size,
        directoryLevels: maxDepth
    };
    console.log(`About to process ${files.length} files`);
    // Sequential on purpose: keeps handle-collision resolution deterministic
    // and avoids hammering the backing store.
    const results = [];
    for (const file of files) {
        const result = await processAndStoreFile(file, collection, {
            allowProblematic: includeProblematic,
            maxBytesOnProblem,
            metadataOnly,
            rootPath
        });
        if (result)
            results.push(result);
    }
    return { metrics, results };
}
//# sourceMappingURL=Loader.js.map