UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.

1,178 lines (1,177 loc) 89 kB
/** * File Type Detection Utility * Centralized file detection for all multimodal file types * Uses multi-strategy approach for reliable type identification */ import { readFile, stat } from "fs/promises"; import { getGlobalDispatcher, interceptors, request } from "undici"; // Lazy-loaded processor singletons — avoids loading heavy media deps // (mediabunny, fluent-ffmpeg, music-metadata, adm-zip) on every generate() call. async function getVideoProcessor() { const mod = await import("../processors/media/VideoProcessor.js"); return mod.videoProcessor; } async function getAudioProcessor() { const mod = await import("../processors/media/AudioProcessor.js"); return mod.audioProcessor; } async function getArchiveProcessor() { const mod = await import("../processors/archive/ArchiveProcessor.js"); return mod.archiveProcessor; } import { tracers, ATTR, withSpan } from "../telemetry/index.js"; import { CSVProcessor } from "./csvProcessor.js"; import { ImageProcessor } from "./imageProcessor.js"; import { logger } from "./logger.js"; import { mimeHintToExtension, mimeHintToFileType, normalizeMimeHint, } from "./mimeTypeHints.js"; import { PDFProcessor } from "./pdfProcessor.js"; /** * Default retry configuration constants */ const DEFAULT_MAX_RETRIES = 3; const DEFAULT_RETRY_DELAY = 1000; // milliseconds /** * Retryable network error codes (Node.js/undici network errors) */ const RETRYABLE_ERROR_CODES = [ "ETIMEDOUT", "ECONNRESET", "ECONNREFUSED", "ENOTFOUND", "ENETUNREACH", "EAI_AGAIN", "EPIPE", "ECONNABORTED", "UND_ERR_CONNECT_TIMEOUT", "UND_ERR_HEADERS_TIMEOUT", "UND_ERR_BODY_TIMEOUT", "UND_ERR_SOCKET", ]; /** * Non-retryable HTTP status codes (client errors) */ const NON_RETRYABLE_STATUS_CODES = [400, 401, 403, 404, 405]; /** * Retryable HTTP status codes (server errors + rate limiting) */ const RETRYABLE_STATUS_CODES = [429, 500, 502, 503, 504]; /** * Check if an error is a recoverable network error that should be retried * * @param error - Error to check * @returns 
True if error is retryable (transient network issue) */ function isRetryableNetworkError(error) { if (!(error instanceof Error)) { return false; } const errorMessage = error.message.toLowerCase(); // Extract error code from various error shapes const errorWithCode = error; const errorCode = errorWithCode.code?.toUpperCase(); // Check for retryable network error codes if (errorCode && RETRYABLE_ERROR_CODES.includes(errorCode)) { return true; } // Check HTTP status code if present in error message (e.g., "HTTP 503") const httpStatusMatch = errorMessage.match(/http\s*(\d{3})/); if (httpStatusMatch) { const statusCode = parseInt(httpStatusMatch[1], 10); if (NON_RETRYABLE_STATUS_CODES.includes(statusCode)) { return false; } if (RETRYABLE_STATUS_CODES.includes(statusCode)) { return true; } } // Check error message for transient issues const transientKeywords = [ "timeout", "timed out", "connection reset", "econnreset", "etimedout", "network error", "socket hang up", "enotfound", "getaddrinfo", "unavailable", "service unavailable", ]; return transientKeywords.some((keyword) => errorMessage.includes(keyword)); } /** * Execute an operation with automatic retry logic on transient network errors * * @param operation - Async function to execute * @param options - Retry configuration options * @returns Promise resolving to the operation result * @throws Error if all retry attempts fail or error is non-retryable */ async function withRetry(operation, options = {}) { const maxRetries = options.maxRetries ?? DEFAULT_MAX_RETRIES; const retryDelay = options.retryDelay ?? 
DEFAULT_RETRY_DELAY; for (let attempt = 0; attempt <= maxRetries; attempt++) { try { return await operation(); } catch (error) { const isRetryable = isRetryableNetworkError(error); const isLastAttempt = attempt === maxRetries; if (!isRetryable || isLastAttempt) { throw error; } // Calculate exponential backoff delay const delay = retryDelay * 2 ** attempt; logger.debug("Retrying network operation after transient error", { attempt: attempt + 1, maxRetries, delay, error: error instanceof Error ? error.message : String(error), }); await new Promise((resolve) => setTimeout(resolve, delay)); } } // TypeScript exhaustiveness check - should never reach here throw new Error("Retry logic failed unexpectedly"); } /** * Check if text has JSON markers (starts with { or [ and ends with corresponding closing bracket) */ function hasJsonMarkers(text) { const trimmed = text.trim(); if (!trimmed) { return false; } const firstChar = trimmed[0]; const lastChar = trimmed[trimmed.length - 1]; const hasMatchingBrackets = (firstChar === "{" && lastChar === "}") || (firstChar === "[" && lastChar === "]"); if (!hasMatchingBrackets) { return false; } try { JSON.parse(trimmed); return true; } catch { return false; } } /** * Format file size in human-readable units */ function formatFileSize(bytes) { if (bytes < 1024) { return `${bytes} bytes`; } if (bytes < 1024 * 1024) { return `${(bytes / 1024).toFixed(2)} KB`; } if (bytes < 1024 * 1024 * 1024) { return `${(bytes / (1024 * 1024)).toFixed(2)} MB`; } return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`; } /** * Centralized file type detection and processing * * @example * ```typescript * // Auto-detect and process any file * const result = await FileDetector.detectAndProcess("data.csv"); * logger.info(result.type); // 'csv' * ``` */ export class FileDetector { // FD-017: Replace hardcoded timeouts with constants. // These default ensure consistent timeout behavior across all file-detection logic. 
static DEFAULT_NETWORK_TIMEOUT = 30000; // 30 seconds static DEFAULT_HEAD_TIMEOUT = 5000; // 5 seconds /** * Auto-detect file type and process in one call * * Runs detection strategies in priority order: * 1. MagicBytesStrategy (95% confidence) - Binary file headers * 2. MimeTypeStrategy (85% confidence) - HTTP Content-Type for URLs * 3. ExtensionStrategy (70% confidence) - File extension * 4. ContentHeuristicStrategy (75% confidence) - Content analysis * * @param input - File path, URL, Buffer, or data URI * @param options - Detection and processing options * @returns Processed file result with type and content */ static async detectAndProcess(input, options) { // Derive filename and size for tracing before detection runs const inputFilename = FileDetector.deriveInputFilename(input); const inputSizeBytes = FileDetector.deriveInputSize(input); return withSpan({ name: "neurolink.file.detect_and_process", tracer: tracers.file, attributes: { [ATTR.FILE_NAME]: inputFilename, [ATTR.FILE_SIZE_BYTES]: inputSizeBytes, }, }, async (span) => { const detection = await FileDetector.detect(input, options); span.setAttribute(ATTR.FILE_CATEGORY, detection.type); span.setAttribute(ATTR.FILE_MIMETYPE, detection.mimeType || "unknown"); span.setAttribute(ATTR.FILE_CONFIDENCE, detection.metadata.confidence); logger.info(`[NEUROLINK] File detected: ${inputFilename} (${detection.mimeType || "unknown"}, ${formatFileSize(inputSizeBytes)}) → category: ${detection.type}`); // FD-018: Comprehensive fallback parsing for extension-less files if (options?.allowedTypes && !options.allowedTypes.includes(detection.type)) { const content = await FileDetector.loadContent(input, detection, options); const errors = []; for (const allowedType of options.allowedTypes) { try { const result = await FileDetector.tryFallbackParsing(content, allowedType, options); if (result) { logger.info(`[FileDetector] ✅ ${allowedType.toUpperCase()} fallback successful`); const outputLength = typeof result.content === 
"string" ? result.content.length : result.content?.length || 0; span.setAttribute(ATTR.FILE_OUTPUT_LENGTH, outputLength); span.setAttribute(ATTR.FILE_SUCCESS, true); span.setAttribute(ATTR.FILE_PROCESSOR_USED, `fallback:${allowedType}`); logger.info(`[NEUROLINK] File processed: ${inputFilename} → ${outputLength} bytes output (fallback: ${allowedType})`); return result; } } catch (error) { const errorMsg = error instanceof Error ? error.message : String(error); errors.push(`${allowedType}: ${errorMsg}`); logger.debug(`[FileDetector] ${allowedType} fallback failed: ${errorMsg}`); } } logger.warn(`[FileDetector] All fallback parsing failed for type "${detection.type}". ` + `Attempted: ${options.allowedTypes.join(", ")}. Falling through to universal handler.`); const csvOptions = options?.csvOptions; const result = await FileDetector.processFile(content, detection, csvOptions, options?.provider); FileDetector.setFileResultSpanAttributes(span, result, inputFilename, detection.type); return result; } const content = await FileDetector.loadContent(input, detection, options); const csvOptions = options?.csvOptions; const result = await FileDetector.processFile(content, detection, csvOptions, options?.provider); FileDetector.setFileResultSpanAttributes(span, result, inputFilename, detection.type); return result; }); } /** * Set span attributes and log after file processing completes. */ static setFileResultSpanAttributes(span, result, filename, processorType) { const outputLength = typeof result.content === "string" ? result.content.length : result.content?.length || 0; const hasImages = Array.isArray(result.images) ? result.images.length > 0 : false; const imageCount = Array.isArray(result.images) ? 
result.images.length : 0; span.setAttribute(ATTR.FILE_OUTPUT_LENGTH, outputLength); span.setAttribute(ATTR.FILE_SUCCESS, true); span.setAttribute(ATTR.FILE_PROCESSOR_USED, processorType); span.setAttribute(ATTR.FILE_HAS_IMAGES, hasImages); span.setAttribute(ATTR.FILE_IMAGE_COUNT, imageCount); logger.info(`[NEUROLINK] File processed: ${filename} → ${outputLength} bytes output` + (imageCount > 0 ? ` + ${imageCount} image(s)` : "") + ` (processor: ${processorType})`); } /** * Derive a human-readable filename from FileInput for tracing. */ static deriveInputFilename(input) { if (typeof input === "string") { if (input.startsWith("data:")) { return "data-uri"; } if (input.startsWith("http")) { try { return new URL(input).pathname.split("/").pop() || "url-file"; } catch { return "url-file"; } } // File path return input.split("/").pop() || input.split("\\").pop() || "file"; } if (Buffer.isBuffer(input)) { return "buffer"; } return "unknown-input"; } /** * Derive byte size from FileInput for tracing. */ static deriveInputSize(input) { if (Buffer.isBuffer(input)) { return input.length; } if (typeof input === "string") { if (input.startsWith("data:")) { // Rough estimate: base64 is ~4/3 of raw const base64Part = input.split(",")[1]; return base64Part ? Math.floor((base64Part.length * 3) / 4) : 0; } return input.length; // path or URL string length (not file size) } return 0; } /** * Classify a FileInput into the FileSource enum used by downstream * loaders. Keeps the mimetype-hint short-circuit in detect() able to * produce a valid FileDetectionResult without re-implementing the * source-inference rules scattered across loadContent(). 
*/ static deriveInputSource(input) { if (Buffer.isBuffer(input)) { return "buffer"; } if (typeof input === "string") { if (input.startsWith("data:")) { return "datauri"; } if (input.startsWith("http://") || input.startsWith("https://")) { return "url"; } return "path"; } return "buffer"; } /** * Try fallback parsing for a specific file type * Used when file detection returns "unknown" but we want to try parsing anyway */ static async tryFallbackParsing(content, fileType, options) { logger.info(`[FileDetector] Attempting ${fileType.toUpperCase()} fallback parsing`); switch (fileType) { case "csv": { // Try CSV parsing const csvOptions = options?.csvOptions; const result = await CSVProcessor.process(content, csvOptions); logger.info(`[FileDetector] CSV fallback: ${result.metadata?.rowCount || 0} rows, ${result.metadata?.columnCount || 0} columns`); return result; } case "text": { // Try text parsing - check if content is valid UTF-8 text const textContent = content.toString("utf-8"); // Validate it's actually text (no null bytes, mostly printable) if (FileDetector.isValidText(textContent)) { return { type: "text", content: textContent, mimeType: FileDetector.guessTextMimeType(textContent), metadata: { confidence: 70, size: content.length, }, }; } throw new Error("Content does not appear to be valid text"); } case "image": { // Image requires magic bytes - can't fallback without detection throw new Error("Image type requires binary detection, cannot fallback parse"); } case "pdf": { // PDF requires magic bytes - can't fallback without detection throw new Error("PDF type requires binary detection, cannot fallback parse"); } case "audio": { // Audio requires magic bytes - can't fallback without detection throw new Error("Audio type requires binary detection, cannot fallback parse"); } case "video": { // Video requires magic bytes - can't fallback without detection throw new Error("Video type requires binary detection, cannot fallback parse"); } case "archive": { // 
Archive requires magic bytes - can't fallback without detection throw new Error("Archive type requires binary detection, cannot fallback parse"); } case "xlsx": { // Document formats require binary detection throw new Error("Excel type requires binary detection, cannot fallback parse"); } case "docx": { throw new Error("Word type requires binary detection, cannot fallback parse"); } case "pptx": { throw new Error("PowerPoint type requires binary detection, cannot fallback parse"); } case "svg": { // SVG can be detected from text content const svgContent = content.toString("utf-8"); if (svgContent.includes("<svg") && svgContent.includes("</svg>")) { return { type: "svg", content: svgContent, mimeType: "image/svg+xml", metadata: { confidence: 70, size: content.length, }, }; } throw new Error("Content does not appear to be valid SVG"); } default: return null; } } /** * Check if content is valid text (UTF-8, mostly printable) */ static isValidText(content) { // Check for null bytes which indicate binary content if (content.includes("\0")) { return false; } // Check if content has reasonable amount of printable characters let printableCount = 0; for (let i = 0; i < content.length; i++) { const code = content.charCodeAt(i); if ((code >= 32 && code < 127) || // ASCII printable code === 9 || // Tab code === 10 || // Newline code === 13 || // Carriage return code > 127 // Unicode (non-ASCII) ) { printableCount++; } } // At least 90% should be printable return printableCount / content.length >= 0.9; } /** * Guess the MIME type for text content based on content patterns */ static guessTextMimeType(content) { const trimmed = content.trim(); // Check for JSON if ((trimmed.startsWith("{") && trimmed.endsWith("}")) || (trimmed.startsWith("[") && trimmed.endsWith("]"))) { try { JSON.parse(trimmed); return "application/json"; } catch { // Not valid JSON, continue checking } } // Check for XML/HTML using stricter detection if (FileDetector.looksLikeXMLStrict(trimmed)) { const isHTML 
= trimmed.includes("<!DOCTYPE html") || trimmed.toLowerCase().includes("<html") || trimmed.includes("<head") || trimmed.includes("<body"); return isHTML ? "text/html" : "application/xml"; } // Check for YAML using robust multi-indicator detection if (FileDetector.looksLikeYAMLStrict(trimmed)) { return "application/yaml"; } // Default to plain text return "text/plain"; } /** * Strict YAML detection for guessTextMimeType * Similar to ContentHeuristicStrategy but requires at least 2 indicators * to avoid false positives from simple key: value patterns */ static looksLikeYAMLStrict(text) { if (text.length === 0) { return false; } const lines = text.split("\n"); // For single-line content, only --- or ... qualify as YAML if (lines.length === 1) { return text === "---" || text === "..."; } // Collect YAML indicators (requires at least 2 for positive detection) const indicators = []; // Indicator 1: Document start marker (---) indicators.push(text.startsWith("---")); // Indicator 2: Document end marker (...) 
indicators.push(/^\.\.\.$|[\n]\.\.\.$/.test(text)); // Indicator 3: YAML list items (- followed by space) indicators.push(/^[\s]*-\s+[^-]/m.test(text)); // Indicator 4: Multiple key-value pairs (at least 2) const keyValuePattern = /^[\s]*[a-zA-Z_][a-zA-Z0-9_-]*:\s*(.+)$/; const keyValueMatches = lines.filter((line) => keyValuePattern.test(line)).length; indicators.push(keyValueMatches >= 2); // Require at least 2 indicators for confident YAML detection const matchCount = indicators.filter(Boolean).length; return matchCount >= 2; } /** * Strict XML detection for guessTextMimeType * Ensures content has proper XML declaration or valid tag structure with closing tags * Prevents false positives from arbitrary content starting with < */ static looksLikeXMLStrict(content) { // XML declaration is a definitive marker if (content.startsWith("<?xml")) { return true; } // Must start with < for XML/HTML if (!content.startsWith("<")) { return false; } // Check for HTML DOCTYPE declaration if (content.includes("<!DOCTYPE html")) { return true; } // Must have valid opening tag structure: <tagname // Not just any < character like "< something" const hasValidOpeningTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/; if (!hasValidOpeningTag.test(content)) { return false; } // Must have at least one closing tag or self-closing tag to be valid XML/HTML const hasClosingTag = /<\/[a-zA-Z][a-zA-Z0-9-]*>/.test(content); const hasSelfClosingTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?\s*\/\s*>/.test(content); return hasClosingTag || hasSelfClosingTag; } /** * Detect file type using multi-strategy approach * Stops at first strategy with confidence >= threshold (default: 80%) */ static async detect(input, options) { // Short-circuit on a trustworthy caller-provided mimetype hint. This is // the eager-path counterpart to FileReferenceRegistry.register()'s hint // handling — necessary for tiny files (<= TINY_MAX) that skip the lazy // registry path. 
normalizeMimeHint drops "application/octet-stream" so a // caller cannot hide real content behind the opaque sentinel. const hintMime = normalizeMimeHint(options?.mimetypeHint); if (hintMime) { const type = mimeHintToFileType(hintMime); if (type) { const ext = mimeHintToExtension(hintMime); const result = { type, mimeType: hintMime, extension: ext || null, source: FileDetector.deriveInputSource(input), metadata: { confidence: 95, filename: FileDetector.deriveInputFilename(input), size: FileDetector.deriveInputSize(input), }, }; logger.info(`[FileDetector] Type: ${type} (95%, from mimetype hint: ${hintMime})`); return result; } } const confidenceThreshold = options?.confidenceThreshold ?? 80; const strategies = [ new MagicBytesStrategy(), new MimeTypeStrategy(), new ExtensionStrategy(), new ContentHeuristicStrategy(), ]; let best = null; for (const strategy of strategies) { const result = await strategy.detect(input); if (!best || result.metadata.confidence > best.metadata.confidence) { best = result; } if (result.metadata.confidence >= confidenceThreshold) { logger.info(`[FileDetector] Type: ${result.type} (${result.metadata.confidence}%)`); return result; } } logger.warn(`[FileDetector] Low confidence: ${best?.type ?? "unknown"} (${best?.metadata.confidence ?? 
0}%)`); return best; } /** * Load file content from various sources */ static async loadContent(input, detection, options) { let source = detection.source; if (source === "buffer" && !Buffer.isBuffer(input)) { if (typeof input === "string") { if (input.startsWith("data:")) { source = "datauri"; } else if (input.startsWith("http://") || input.startsWith("https://")) { source = "url"; } else { source = "path"; } } } switch (source) { case "url": return await FileDetector.loadFromURL(input, options); case "path": return await FileDetector.loadFromPath(input, options); case "buffer": return input; case "datauri": return FileDetector.loadFromDataURI(input); default: throw new Error(`Unknown source: ${source}`); } } /** * SDK-8: Format an informative placeholder when a file processor fails. * Instead of bare "[Video file: name]" strings, include size, format, and * the reason for failure so the LLM can acknowledge the attachment. */ static formatInformativePlaceholder(typeName, filename, content, detection, error) { const sizeStr = content.length < 1024 ? `${content.length} bytes` : content.length < 1024 * 1024 ? `${(content.length / 1024).toFixed(1)} KB` : `${(content.length / (1024 * 1024)).toFixed(1)} MB`; const errorMsg = error instanceof Error ? error.message : error ? String(error) : "Processing returned no usable content"; return (`[${typeName} File: "${filename}"]\n` + `Size: ${sizeStr}\n` + `Format: ${detection.mimeType || "unknown"}\n` + `Error: Could not extract content (${errorMsg}).\n` + `The file was attached but could not be fully analyzed.`); } /** * Extract metadata and printable strings from an unrecognized binary file. * This is the "extract what you can" path for unknown file types. 
* * Extracts: * - File size (human-readable) * - MIME type / detected format * - First N bytes as hex dump (for identification) * - Printable ASCII/UTF-8 strings found in the binary (like `strings` command) * - Known file signatures that we don't have full processors for * * @param content Raw file buffer * @param detection Detection result (may be "unknown") * @param filename Original filename (if known) * @returns Formatted text summary suitable for LLM consumption */ static extractBinaryMetadata(content, detection, filename) { const parts = []; // Header const ext = detection.extension ? `.${detection.extension}` : filename.includes(".") ? filename.slice(filename.lastIndexOf(".")) : ""; const typeLabel = ext ? `${ext.toUpperCase().slice(1)} file` : "Binary file"; parts.push(`[${typeLabel}: "${filename}"]`); // Basic metadata const sizeStr = formatFileSize(content.length); parts.push(`Size: ${sizeStr}`); if (detection.mimeType && detection.mimeType !== "application/octet-stream") { parts.push(`Format: ${detection.mimeType}`); } // Known binary signature identification (broader than our processing capabilities) const sigLabel = FileDetector.identifyBinarySignature(content); if (sigLabel) { parts.push(`Identified as: ${sigLabel}`); } // Hex dump of first 32 bytes for identification const hexPreview = content .subarray(0, Math.min(32, content.length)) .toString("hex") .match(/.{1,2}/g) ?.join(" "); if (hexPreview) { parts.push(`Header bytes: ${hexPreview}`); } // Extract printable strings (similar to Unix `strings` command) const strings = FileDetector.extractPrintableStrings(content, 4, 50); if (strings.length > 0) { parts.push(`\nEmbedded text found (${strings.length} string${strings.length > 1 ? 
"s" : ""}):`); for (const s of strings) { parts.push(` "${s}"`); } } parts.push(`\nThis file was attached but its format is not fully supported for content extraction.`); parts.push(`The above metadata and any embedded text have been extracted for context.`); return parts.join("\n"); } /** * Identify known binary file signatures beyond what we can process. * Returns a human-readable description, or null if unrecognized. */ static identifyBinarySignature(buf) { if (buf.length < 4) { return null; } // SQLite: "SQLite format 3\0" if (buf.length >= 16 && buf.subarray(0, 15).toString("ascii") === "SQLite format 3") { return "SQLite database"; } // WOFF: "wOFF" if (buf[0] === 0x77 && buf[1] === 0x4f && buf[2] === 0x46 && buf[3] === 0x46) { return "WOFF font"; } // WOFF2: "wOF2" if (buf[0] === 0x77 && buf[1] === 0x4f && buf[2] === 0x46 && buf[3] === 0x32) { return "WOFF2 font"; } // TrueType/OpenType: starts with 0x00010000 or "OTTO" if ((buf[0] === 0x00 && buf[1] === 0x01 && buf[2] === 0x00 && buf[3] === 0x00) || (buf[0] === 0x4f && buf[1] === 0x54 && buf[2] === 0x54 && buf[3] === 0x4f)) { return "TrueType/OpenType font"; } // ELF executable: \x7fELF if (buf[0] === 0x7f && buf[1] === 0x45 && buf[2] === 0x4c && buf[3] === 0x46) { return "ELF executable/library"; } // Mach-O: 0xFEEDFACE or 0xFEEDFACF (64-bit) or 0xCAFEBABE (universal) if ((buf[0] === 0xfe && buf[1] === 0xed && buf[2] === 0xfa && buf[3] === 0xce) || (buf[0] === 0xfe && buf[1] === 0xed && buf[2] === 0xfa && buf[3] === 0xcf) || (buf[0] === 0xca && buf[1] === 0xfe && buf[2] === 0xba && buf[3] === 0xbe)) { return "Mach-O executable/library"; } // PE/Windows executable: "MZ" if (buf[0] === 0x4d && buf[1] === 0x5a) { return "Windows PE executable/DLL"; } // WebAssembly: "\0asm" if (buf[0] === 0x00 && buf[1] === 0x61 && buf[2] === 0x73 && buf[3] === 0x6d) { return "WebAssembly binary"; } // DWG (AutoCAD): starts with "AC10" if (buf[0] === 0x41 && buf[1] === 0x43 && buf[2] === 0x31 && buf[3] === 0x30) { return 
"AutoCAD DWG drawing"; } // BZ2: "BZ" + 'h' if (buf[0] === 0x42 && buf[1] === 0x5a && buf[2] === 0x68) { return "BZip2 compressed archive"; } // XZ: 0xFD + "7zXZ" if (buf.length >= 6 && buf[0] === 0xfd && buf[1] === 0x37 && buf[2] === 0x7a && buf[3] === 0x58 && buf[4] === 0x5a && buf[5] === 0x00) { return "XZ compressed archive"; } // 7z: "7z" + BC AF 27 1C if (buf.length >= 6 && buf[0] === 0x37 && buf[1] === 0x7a && buf[2] === 0xbc && buf[3] === 0xaf && buf[4] === 0x27 && buf[5] === 0x1c) { return "7-Zip archive"; } // ISO 9660: "CD001" at offset 32769 if (buf.length > 32773 && buf.subarray(32769, 32774).toString("ascii") === "CD001") { return "ISO 9660 disc image"; } // Apache Parquet: "PAR1" if (buf[0] === 0x50 && buf[1] === 0x41 && buf[2] === 0x52 && buf[3] === 0x31) { return "Apache Parquet data file"; } // Protocol Buffers compiled: (no fixed magic, skip) // TIFF (already handled as image, but including for completeness) if ((buf[0] === 0x49 && buf[1] === 0x49 && buf[2] === 0x2a && buf[3] === 0x00) || (buf[0] === 0x4d && buf[1] === 0x4d && buf[2] === 0x00 && buf[3] === 0x2a)) { return "TIFF image"; } // ICO: 00 00 01 00 if (buf[0] === 0x00 && buf[1] === 0x00 && buf[2] === 0x01 && buf[3] === 0x00) { return "ICO icon image"; } return null; } /** * Extract printable ASCII strings from a binary buffer. * Similar to the Unix `strings` utility. 
* * @param buf Buffer to scan * @param minLength Minimum string length to include (default 4) * @param maxStrings Maximum number of strings to return (default 50) * @returns Array of printable strings found in the binary */ static extractPrintableStrings(buf, minLength = 4, maxStrings = 50) { const strings = []; let current = ""; // Only scan first 64KB to avoid huge processing time const scanLimit = Math.min(buf.length, 64 * 1024); for (let i = 0; i < scanLimit; i++) { const byte = buf[i]; // Printable ASCII range (space through tilde) plus tab if ((byte >= 0x20 && byte <= 0x7e) || byte === 0x09) { current += String.fromCharCode(byte); } else { if (current.length >= minLength) { strings.push(current); if (strings.length >= maxStrings) { break; } } current = ""; } } // Flush last string if (current.length >= minLength && strings.length < maxStrings) { strings.push(current); } return strings; } /** * Route to appropriate processor */ static async processFile(content, detection, options, provider) { switch (detection.type) { case "csv": // Pass original extension through to CSV processor; if detection has none, // fall back to any extension provided in csvOptions. return await CSVProcessor.process(content, { ...options, extension: detection.extension ?? 
options?.extension, }); case "image": return await ImageProcessor.process(content); case "pdf": return await PDFProcessor.process(content, { provider }); case "svg": // SVG is processed as text content (sanitized XML markup) // AI providers don't support SVG as image format, so we extract text content return await FileDetector.processSvgAsText(content, detection); case "video": return await FileDetector.processVideoFile(content, detection); case "audio": return await FileDetector.processAudioFile(content, detection); case "archive": return await FileDetector.processArchiveFile(content, detection); case "xlsx": return await FileDetector.processXlsxFile(content, detection); case "docx": return await FileDetector.processDocxFile(content, detection); case "pptx": return await FileDetector.processPptxFile(content, detection); case "text": return { type: "text", content: content.toString("utf-8"), mimeType: detection.mimeType || "text/plain", metadata: detection.metadata, }; default: { // Graceful degradation: try to treat unknown types as text if content is valid UTF-8 const unknownContent = content.toString("utf-8"); if (FileDetector.isValidText(unknownContent)) { logger.warn(`[FileDetector] Unknown type "${detection.type}", treating as text`); return { type: "text", content: unknownContent, mimeType: detection.mimeType || "text/plain", metadata: detection.metadata, }; } // Binary file that we can't fully process — extract what we can // (metadata, printable strings, signature identification) const filename = detection.metadata.filename || "file"; logger.warn(`[FileDetector] Unknown binary type "${detection.type}", extracting metadata for "${filename}"`); return { type: "unknown", content: FileDetector.extractBinaryMetadata(content, detection, filename), mimeType: detection.mimeType || "application/octet-stream", metadata: detection.metadata, }; } } } /** * Process video file: extract metadata, keyframes, and subtitles via VideoProcessor */ static async 
processVideoFile(content, detection) { const videoFilename = detection.metadata.filename || "video"; try { const videoResult = await (await getVideoProcessor()).processFile({ id: videoFilename, name: videoFilename, mimetype: detection.mimeType || "video/mp4", size: content.length, buffer: content, }); if (videoResult.success && videoResult.data) { return { type: "video", content: videoResult.data.textContent || FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection), mimeType: detection.mimeType, images: videoResult.data.keyframes && videoResult.data.keyframes.length > 0 ? videoResult.data.keyframes : undefined, metadata: { ...detection.metadata, frameCount: videoResult.data.frameCount, hasKeyframes: videoResult.data.hasKeyframes, }, }; } } catch (videoError) { logger.warn(`[FileDetector] VideoProcessor failed for ${videoFilename}, using fallback`, videoError instanceof Error ? videoError.message : String(videoError)); return { type: "video", content: FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection, videoError), mimeType: detection.mimeType, metadata: detection.metadata, }; } // Fallback if processor returned no data return { type: "video", content: FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection), mimeType: detection.mimeType, metadata: detection.metadata, }; } /** * Process audio file: extract metadata, tags, and cover art via AudioProcessor */ static async processAudioFile(content, detection) { const audioFilename = detection.metadata.filename || "audio"; try { const audioResult = await (await getAudioProcessor()).processFile({ id: audioFilename, name: audioFilename, mimetype: detection.mimeType || "audio/mpeg", size: content.length, buffer: content, }); if (audioResult.success && audioResult.data) { return { type: "audio", content: audioResult.data.textContent || FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection), mimeType: 
detection.mimeType, // Surface embedded cover art as an image content block images: audioResult.data.coverArt ? [audioResult.data.coverArt] : undefined, metadata: detection.metadata, }; } } catch (audioError) { logger.warn(`[FileDetector] AudioProcessor failed for ${audioFilename}, using fallback`, audioError instanceof Error ? audioError.message : String(audioError)); return { type: "audio", content: FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection, audioError), mimeType: detection.mimeType, metadata: detection.metadata, }; } // Fallback if processor returned no data return { type: "audio", content: FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection), mimeType: detection.mimeType, metadata: detection.metadata, }; } /** * Process archive file: list contents and extract metadata via ArchiveProcessor */ static async processArchiveFile(content, detection) { const archiveFilename = detection.metadata.filename || "archive"; try { const archiveResult = await (await getArchiveProcessor()).processFile({ id: archiveFilename, name: archiveFilename, mimetype: detection.mimeType || "application/zip", size: content.length, buffer: content, }); if (archiveResult.success && archiveResult.data) { return { type: "archive", content: archiveResult.data.textContent || FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection), mimeType: detection.mimeType, metadata: detection.metadata, }; } } catch (archiveError) { logger.warn(`[FileDetector] ArchiveProcessor failed for ${archiveFilename}, using fallback`, archiveError instanceof Error ? 
archiveError.message : String(archiveError)); return { type: "archive", content: FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection, archiveError), mimeType: detection.mimeType, metadata: detection.metadata, }; } // Fallback if processor returned no data return { type: "archive", content: FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection), mimeType: detection.mimeType, metadata: detection.metadata, }; } /** * Process Excel/OpenDocument spreadsheet file via ExcelProcessor or OpenDocumentProcessor */ static async processXlsxFile(content, detection) { const xlsxFilename = detection.metadata.filename || "spreadsheet"; try { const ext = detection.extension?.toLowerCase(); if (ext === "ods") { const { openDocumentProcessor } = await import("../processors/document/OpenDocumentProcessor.js"); const odsResult = await openDocumentProcessor.processFile({ id: xlsxFilename, name: xlsxFilename, mimetype: detection.mimeType || "application/vnd.oasis.opendocument.spreadsheet", size: content.length, buffer: content, }); if (odsResult.success && odsResult.data) { return { type: "xlsx", content: odsResult.data.textContent || FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection), mimeType: detection.mimeType, metadata: detection.metadata, }; } } else { const { excelProcessor } = await import("../processors/document/ExcelProcessor.js"); const xlsxResult = await excelProcessor.processFile({ id: xlsxFilename, name: xlsxFilename, mimetype: detection.mimeType || "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", size: content.length, buffer: content, }); if (xlsxResult.success && xlsxResult.data) { // Build text content from worksheets const sheets = xlsxResult.data.worksheets || []; let textContent = `Spreadsheet: ${sheets.length} sheet(s), ${xlsxResult.data.totalRows} total rows\n`; for (const sheet of sheets) { textContent += `\n### Sheet: ${sheet.name}\n`; 
textContent += `Columns (${sheet.columnCount}): ${sheet.headers.join(", ")}\n`; textContent += `Rows: ${sheet.rowCount}\n`; // Include first rows as sample data const sampleRows = sheet.rows.slice(0, 20); const rowText = sampleRows .map((row) => row.map((c) => String(c ?? "")).join("\t")) .join("\n"); if (!rowText) { continue; } textContent += `\nData:\n${sheet.headers.join("\t")}\n${rowText}\n`; const remaining = sheet.rowCount - 20; if (remaining > 0) { textContent += `... (${remaining} more rows)\n`; } } return { type: "xlsx", content: textContent, mimeType: detection.mimeType, metadata: detection.metadata, }; } } } catch (xlsxError) { logger.warn(`[FileDetector] ExcelProcessor failed for ${xlsxFilename}, using fallback`, xlsxError instanceof Error ? xlsxError.message : String(xlsxError)); return { type: "xlsx", content: FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection, xlsxError), mimeType: detection.mimeType, metadata: detection.metadata, }; } // Fallback if processor returned no data return { type: "xlsx", content: FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection), mimeType: detection.mimeType, metadata: detection.metadata, }; } /** * Process Word/OpenDocument/RTF document via WordProcessor, OpenDocumentProcessor, or RtfProcessor */ static async processDocxFile(content, detection) { const docxFilename = detection.metadata.filename || "document"; const ext = detection.extension?.toLowerCase(); try { if (ext === "odt") { const { openDocumentProcessor } = await import("../processors/document/OpenDocumentProcessor.js"); const odtResult = await openDocumentProcessor.processFile({ id: docxFilename, name: docxFilename, mimetype: detection.mimeType || "application/vnd.oasis.opendocument.text", size: content.length, buffer: content, }); if (odtResult.success && odtResult.data) { return { type: "docx", content: odtResult.data.textContent || 
FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection), mimeType: detection.mimeType, metadata: detection.metadata, }; } } else if (ext === "rtf") { const { rtfProcessor } = await import("../processors/document/RtfProcessor.js");