mcard-js
Version:
MCard - Content-addressable storage with cryptographic hashing, handle resolution, and vector search for Node.js and browsers
203 lines • 6.8 kB
JavaScript
import * as crypto from 'crypto';
import * as fs from 'fs/promises';
import * as path from 'path';
import { ContentTypeInterpreter } from './model/ContentTypeInterpreter';
/**
 * Read up to `byteCap` bytes of a file and return them as normalized text.
 *
 * The file is read in 8 KiB chunks. Every chunk is fed to a SHA-256 hash and
 * to a streaming UTF-8 decoder (non-fatal: malformed sequences are replaced
 * instead of throwing). Carriage returns are dropped, and when `wrapWidth` is
 * positive a newline is inserted after every `wrapWidth` code points that
 * occur without an intervening newline.
 *
 * @param {string} filePath - Path of the file to read.
 * @param {{ byteCap: number, wrapWidth: number }} options - `byteCap` is the
 *   maximum number of bytes to read; `wrapWidth` is the soft-wrap width
 *   (wrapping is disabled unless it is greater than 0).
 * @returns {Promise<{ text: string, originalSize: number, originalSha256Prefix: string }>}
 *   `text` is the normalized text, `originalSize` the number of bytes actually
 *   read (never more than `byteCap`), and `originalSha256Prefix` the first 16
 *   hex digits of the SHA-256 of those bytes.
 */
export async function streamReadNormalizedText(filePath, options) {
    const { byteCap, wrapWidth } = options;
    const sha = crypto.createHash('sha256');
    let totalSize = 0;
    let producedText = '';
    let currentLen = 0;
    // Shared by the streaming path and the final decoder flush so both apply
    // exactly the same CR-stripping and soft-wrap rules.
    const appendNormalized = (decoded) => {
        for (const ch of decoded) {
            if (ch === '\r') {
                continue;
            }
            producedText += ch;
            if (ch === '\n') {
                currentLen = 0;
                continue;
            }
            currentLen++;
            if (wrapWidth > 0 && currentLen >= wrapWidth) {
                producedText += '\n';
                currentLen = 0;
            }
        }
    };
    const handle = await fs.open(filePath, 'r');
    try {
        const buffer = new Uint8Array(8192);
        const decoder = new TextDecoder('utf-8', { fatal: false });
        let remaining = byteCap;
        while (remaining > 0) {
            // totalSize doubles as the explicit file offset of the next read.
            const { bytesRead } = await handle.read(buffer, 0, Math.min(buffer.length, remaining), totalSize);
            if (bytesRead === 0) {
                break;
            }
            const chunk = buffer.subarray(0, bytesRead);
            sha.update(chunk);
            totalSize += bytesRead;
            remaining -= bytesRead;
            // stream: true keeps a multi-byte sequence that straddles two
            // chunks buffered inside the decoder until it is complete.
            appendNormalized(decoder.decode(chunk, { stream: true }));
        }
        // Flush whatever the decoder is still holding (e.g. a sequence cut
        // off by byteCap).
        appendNormalized(decoder.decode());
    }
    finally {
        await handle.close();
    }
    return {
        text: producedText,
        originalSize: totalSize,
        originalSha256Prefix: sha.digest('hex').substring(0, 16)
    };
}
// Constants
// Size ceiling in bytes (50 * 1024 * 1024): isProblematicFile reports larger
// files as problematic and readFileSafely throws for them.
const MAX_FILE_SIZE = 50 * 1024 * 1024; // 50MB
// Delay in milliseconds (5 seconds) that readFileSafely arms its timeout timer with.
const READ_TIMEOUT_MS = 5000;
/**
 * Decide whether a file should be skipped because it is likely to cause
 * processing trouble.
 *
 * Checks run in this order: an empty file is never problematic; a file whose
 * base name starts with '.' is; a file larger than MAX_FILE_SIZE is; a file
 * with a known long-line extension that exceeds 1 MiB is; otherwise the first
 * 32 KiB are sampled and handed to ContentTypeInterpreter for binary and
 * pathological-line detection. Any error raised along the way (for example a
 * failing stat) makes the file count as problematic.
 *
 * @param {string} filePath - Path of the file to inspect.
 * @returns {Promise<boolean>} true when the file should be skipped.
 */
export async function isProblematicFile(filePath) {
    try {
        const { size } = await fs.stat(filePath);
        if (size === 0) {
            return false;
        }
        // Hidden entries and oversized files are rejected without opening them.
        const hidden = path.basename(filePath).startsWith('.');
        if (hidden || size > MAX_FILE_SIZE) {
            return true;
        }
        const longLineType = ContentTypeInterpreter.isKnownLongLineExtension(path.extname(filePath));
        if (longLineType && size > 1024 * 1024) {
            return true;
        }
        // Inspect only the head of the file.
        const fileHandle = await fs.open(filePath, 'r');
        let flagged = false;
        try {
            const scratch = new Uint8Array(32 * 1024);
            const readResult = await fileHandle.read(scratch, 0, scratch.length, 0);
            const head = scratch.subarray(0, readResult.bytesRead);
            flagged = Boolean(ContentTypeInterpreter.isUnstructuredBinary(head) ||
                ContentTypeInterpreter.hasPathologicalLines(head, longLineType));
        }
        finally {
            await fileHandle.close();
        }
        return flagged;
    }
    catch {
        // Best effort: anything that cannot be inspected is treated as problematic.
        return true;
    }
}
/**
 * Read a whole file into memory, enforcing a size limit and a time budget.
 *
 * The file is read in bounded chunks into a buffer sized from `stat`. Between
 * chunks the timeout is checked, so a read that exceeds READ_TIMEOUT_MS is
 * abandoned with an error. A single chunk read that stalls inside the
 * operating system is not interrupted; the check is cooperative.
 *
 * @param {string} filePath - Path of the file to read.
 * @param {object} [options] - Accepted for interface compatibility; this
 *   function does not read any field of it.
 * @returns {Promise<Uint8Array>} The bytes read. If the file yields fewer
 *   bytes than `stat` reported, the result is trimmed to the bytes actually
 *   read instead of being padded with zeros.
 * @throws {Error} `File too large: <size>` when the file exceeds
 *   MAX_FILE_SIZE, `Read timeout for <path>` when the time budget is
 *   exhausted, or any error raised by the underlying fs calls.
 */
export async function readFileSafely(filePath, options = {}) {
    const stats = await fs.stat(filePath);
    if (stats.size > MAX_FILE_SIZE)
        throw new Error(`File too large: ${stats.size}`);
    // Upper bound for one read call, so the timeout is re-checked regularly.
    const chunkSize = 1024 * 1024;
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), READ_TIMEOUT_MS);
    try {
        const handle = await fs.open(filePath, 'r');
        try {
            const buffer = new Uint8Array(stats.size);
            let filled = 0;
            while (filled < buffer.length) {
                if (controller.signal.aborted)
                    throw new Error(`Read timeout for ${filePath}`);
                const wanted = Math.min(chunkSize, buffer.length - filled);
                // A read may return fewer bytes than requested, so keep
                // going until the buffer is full or end of file is reached.
                const { bytesRead } = await handle.read(buffer, filled, wanted, filled);
                if (bytesRead === 0)
                    break;
                filled += bytesRead;
            }
            // The file may have shrunk since stat; never expose unread bytes.
            return filled === buffer.length ? buffer : buffer.subarray(0, filled);
        }
        finally {
            await handle.close();
        }
    }
    finally {
        clearTimeout(timeout);
    }
}
/**
 * Collect the paths of the regular files in a directory that pass the
 * isProblematicFile filter.
 *
 * Entries whose name starts with '.' are ignored. Sub-directories are
 * descended into only when `recursive` is true. Entries that are neither a
 * directory nor a regular file are ignored. If anything throws, a warning is
 * logged and the paths gathered so far are returned.
 *
 * @param {string} dirPath - Directory to scan.
 * @param {boolean} [recursive=false] - Whether to walk sub-directories.
 * @returns {Promise<string[]>} Paths built by joining `dirPath` with each
 *   accepted entry name.
 */
export async function listFiles(dirPath, recursive = false) {
    const collected = [];
    try {
        const dirents = await fs.readdir(dirPath, { withFileTypes: true });
        for (const dirent of dirents) {
            const { name } = dirent;
            // Hidden entries are never listed or descended into.
            if (name.startsWith('.')) {
                continue;
            }
            const candidate = path.join(dirPath, name);
            if (dirent.isDirectory()) {
                if (!recursive) {
                    continue;
                }
                const nested = await listFiles(candidate, true);
                for (const nestedPath of nested) {
                    collected.push(nestedPath);
                }
                continue;
            }
            if (!dirent.isFile()) {
                continue;
            }
            const problematic = await isProblematicFile(candidate);
            if (!problematic) {
                collected.push(candidate);
            }
        }
    }
    catch (e) {
        console.warn(`Error listing directory ${dirPath}:`, e);
    }
    return collected;
}
/**
 * Load a file through readFileSafely and describe its content.
 *
 * The first 1 MiB of the bytes is used as a sample for
 * ContentTypeInterpreter's type and binary detection. Content judged to be
 * text is decoded as UTF-8: strictly first, then with replacement characters
 * if strict decoding throws. Binary content is returned as the raw bytes.
 *
 * @param {string} filePath - Path of the file to process.
 * @param {object} [options] - `allowPathological` and `maxBytes` are passed
 *   on to readFileSafely; a truthy `forceBinary` overrides the binary
 *   detection result.
 * @returns {Promise<{ content: (string|Uint8Array), filename: string, mimeType: *, extension: *, isBinary: *, size: number }>}
 *   The decoded text or raw bytes, the file's base name, the detected MIME
 *   type and extension, the binary flag, and the number of bytes read.
 */
export async function processFileContent(filePath, options = {}) {
    const bytes = await readFileSafely(filePath, {
        allowPathological: options.allowPathological,
        maxBytes: options.maxBytes
    });
    // Detection only looks at the head of the file.
    const head = bytes.subarray(0, 1024 * 1024);
    const detection = ContentTypeInterpreter.detectContentType(head, path.extname(filePath));
    const looksBinary = ContentTypeInterpreter.isBinaryContent(head, detection.mimeType);
    const isBinary = options.forceBinary ? true : looksBinary;
    // Strict decode first; fall back to replacement characters on malformed input.
    const decodeUtf8 = (raw) => {
        try {
            return new TextDecoder('utf-8', { fatal: true }).decode(raw);
        }
        catch {
            return new TextDecoder('utf-8', { fatal: false }).decode(raw);
        }
    };
    const content = isBinary ? bytes : decodeUtf8(bytes);
    return {
        content,
        filename: path.basename(filePath),
        mimeType: detection.mimeType,
        extension: detection.extension,
        isBinary,
        size: bytes.length
    };
}
//# sourceMappingURL=FileIO.js.map