@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio
1,178 lines (1,177 loc) • 88.9 kB
JavaScript
/**
* File Type Detection Utility
* Centralized file detection for all multimodal file types
* Uses multi-strategy approach for reliable type identification
*/
import { readFile, stat } from "fs/promises";
import { getGlobalDispatcher, interceptors, request } from "undici";
// Lazy-loaded processor singletons — avoids loading heavy media deps
// (mediabunny, fluent-ffmpeg, music-metadata, adm-zip) on every generate() call.
/**
 * Resolve the shared video processor on demand.
 * The module is pulled in with a dynamic import so it is not loaded until
 * a video file actually needs processing.
 */
async function getVideoProcessor() {
    const { videoProcessor } = await import("../processors/media/VideoProcessor.js");
    return videoProcessor;
}
/**
 * Resolve the shared audio processor on demand.
 * The module is pulled in with a dynamic import so it is not loaded until
 * an audio file actually needs processing.
 */
async function getAudioProcessor() {
    const { audioProcessor } = await import("../processors/media/AudioProcessor.js");
    return audioProcessor;
}
/**
 * Resolve the shared archive processor on demand.
 * The module is pulled in with a dynamic import so it is not loaded until
 * an archive file actually needs processing.
 */
async function getArchiveProcessor() {
    const { archiveProcessor } = await import("../processors/archive/ArchiveProcessor.js");
    return archiveProcessor;
}
import { tracers, ATTR, withSpan } from "../telemetry/index.js";
import { CSVProcessor } from "./csvProcessor.js";
import { ImageProcessor } from "./imageProcessor.js";
import { logger } from "./logger.js";
import { mimeHintToExtension, mimeHintToFileType, normalizeMimeHint, } from "./mimeTypeHints.js";
import { PDFProcessor } from "./pdfProcessor.js";
/**
 * Default retry configuration constants.
 * withRetry() falls back to these when the matching option is not supplied.
 */
const DEFAULT_MAX_RETRIES = 3;
const DEFAULT_RETRY_DELAY = 1000; // milliseconds; base delay, doubled after each failed attempt
/**
 * Retryable network error codes (Node.js/undici network errors).
 * isRetryableNetworkError() compares an error's upper-cased `code` against
 * this list; a match means the failure is treated as transient.
 */
const RETRYABLE_ERROR_CODES = [
"ETIMEDOUT",
"ECONNRESET",
"ECONNREFUSED",
"ENOTFOUND",
"ENETUNREACH",
"EAI_AGAIN",
"EPIPE",
"ECONNABORTED",
"UND_ERR_CONNECT_TIMEOUT",
"UND_ERR_HEADERS_TIMEOUT",
"UND_ERR_BODY_TIMEOUT",
"UND_ERR_SOCKET",
];
/**
 * Non-retryable HTTP status codes (client errors).
 * When an error message contains "HTTP <status>" with one of these codes,
 * isRetryableNetworkError() returns false without checking keywords.
 */
const NON_RETRYABLE_STATUS_CODES = [400, 401, 403, 404, 405];
/**
 * Retryable HTTP status codes (server errors + rate limiting).
 * When an error message contains "HTTP <status>" with one of these codes,
 * isRetryableNetworkError() returns true.
 */
const RETRYABLE_STATUS_CODES = [429, 500, 502, 503, 504];
/**
 * Check if an error is a recoverable network error that should be retried.
 *
 * Decision order:
 * 1. A string `code` listed in RETRYABLE_ERROR_CODES → retryable.
 * 2. An "HTTP <status>" fragment in the message → decided by the
 *    NON_RETRYABLE / RETRYABLE status lists (unlisted statuses fall through).
 * 3. Otherwise → retryable only if the message contains a transient keyword.
 *
 * @param error - Error to check (non-Error values are never retryable)
 * @returns True if error is retryable (transient network issue)
 */
function isRetryableNetworkError(error) {
    if (!(error instanceof Error)) {
        return false;
    }
    const errorMessage = String(error.message ?? "").toLowerCase();
    // `code` is not guaranteed to be a string: DOMException instances (for example
    // AbortError / TimeoutError) expose a numeric legacy code. Calling
    // toUpperCase() on a number would throw from inside the caller's catch
    // block and hide the real failure, so only string codes are normalised.
    const rawCode = error.code;
    const errorCode = typeof rawCode === "string" ? rawCode.toUpperCase() : undefined;
    // Check for retryable network error codes
    if (errorCode && RETRYABLE_ERROR_CODES.includes(errorCode)) {
        return true;
    }
    // Check HTTP status code if present in error message (e.g., "HTTP 503")
    const httpStatusMatch = errorMessage.match(/http\s*(\d{3})/);
    if (httpStatusMatch) {
        const statusCode = Number.parseInt(httpStatusMatch[1], 10);
        if (NON_RETRYABLE_STATUS_CODES.includes(statusCode)) {
            return false;
        }
        if (RETRYABLE_STATUS_CODES.includes(statusCode)) {
            return true;
        }
    }
    // Check error message for transient issues
    const transientKeywords = [
        "timeout",
        "timed out",
        "connection reset",
        "econnreset",
        "etimedout",
        "network error",
        "socket hang up",
        "enotfound",
        "getaddrinfo",
        "unavailable",
        "service unavailable",
    ];
    return transientKeywords.some((keyword) => errorMessage.includes(keyword));
}
/**
 * Execute an operation with automatic retry logic on transient network errors.
 *
 * The operation is always attempted at least once. After a failure that
 * isRetryableNetworkError() classifies as transient, it is retried up to
 * `maxRetries` more times, waiting `retryDelay * 2 ** attempt` milliseconds
 * between attempts (exponential backoff).
 *
 * @param operation - Async function to execute
 * @param options - Retry configuration options
 * @param options.maxRetries - Retries after the first attempt (default DEFAULT_MAX_RETRIES).
 *   Negative values are treated as 0, fractions are floored, NaN falls back to the default.
 * @param options.retryDelay - Base delay in milliseconds (default DEFAULT_RETRY_DELAY)
 * @returns Promise resolving to the operation result
 * @throws Error if all retry attempts fail or error is non-retryable
 */
async function withRetry(operation, options = {}) {
    const requestedRetries = options?.maxRetries ?? DEFAULT_MAX_RETRIES;
    // Normalise the retry budget. Without this, a negative or NaN value would
    // skip the loop entirely (the operation would never run) and a fractional
    // value would never satisfy the "last attempt" check, losing the real error.
    const maxRetries = typeof requestedRetries === "number" && !Number.isNaN(requestedRetries)
        ? Math.max(0, Math.floor(requestedRetries))
        : DEFAULT_MAX_RETRIES;
    const retryDelay = options?.retryDelay ?? DEFAULT_RETRY_DELAY;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
        try {
            return await operation();
        }
        catch (error) {
            const isRetryable = isRetryableNetworkError(error);
            const isLastAttempt = attempt === maxRetries;
            if (!isRetryable || isLastAttempt) {
                throw error;
            }
            // Calculate exponential backoff delay
            const delay = retryDelay * 2 ** attempt;
            logger.debug("Retrying network operation after transient error", {
                attempt: attempt + 1,
                maxRetries,
                delay,
                error: error instanceof Error ? error.message : String(error),
            });
            await new Promise((resolve) => setTimeout(resolve, delay));
        }
    }
    // Safeguard only: with a normalised maxRetries the loop always returns or throws.
    throw new Error("Retry logic failed unexpectedly");
}
/**
 * Report whether text is a JSON object or array.
 * The trimmed text must be wrapped in a matching {...} or [...] pair and
 * must also parse successfully with JSON.parse.
 */
function hasJsonMarkers(text) {
    const candidate = text.trim();
    const wrappedAsObject = candidate.startsWith("{") && candidate.endsWith("}");
    const wrappedAsArray = candidate.startsWith("[") && candidate.endsWith("]");
    if (!wrappedAsObject && !wrappedAsArray) {
        return false;
    }
    try {
        JSON.parse(candidate);
    }
    catch {
        return false;
    }
    return true;
}
/**
 * Render a byte count as a human-readable size.
 * Values below 1024 are printed as whole bytes; larger values use KB, MB
 * or GB (1024-based) with two decimal places.
 */
function formatFileSize(bytes) {
    const KIB = 1024;
    const MIB = KIB * KIB;
    const GIB = MIB * KIB;
    if (bytes < KIB) {
        return `${bytes} bytes`;
    }
    let divisor = GIB;
    let unit = "GB";
    if (bytes < MIB) {
        divisor = KIB;
        unit = "KB";
    }
    else if (bytes < GIB) {
        divisor = MIB;
        unit = "MB";
    }
    return `${(bytes / divisor).toFixed(2)} ${unit}`;
}
/**
* Centralized file type detection and processing
*
* @example
* ```typescript
* // Auto-detect and process any file
* const result = await FileDetector.detectAndProcess("data.csv");
* logger.info(result.type); // 'csv'
* ```
*/
export class FileDetector {
// FD-017: Replace hardcoded timeouts with constants.
// These defaults ensure consistent timeout behavior across all file-detection logic.
static DEFAULT_NETWORK_TIMEOUT = 30000; // 30 seconds
static DEFAULT_HEAD_TIMEOUT = 5000; // 5 seconds
/**
* Auto-detect file type and process in one call
*
* Runs detection strategies in priority order:
* 1. MagicBytesStrategy (95% confidence) - Binary file headers
* 2. MimeTypeStrategy (85% confidence) - HTTP Content-Type for URLs
* 3. ExtensionStrategy (70% confidence) - File extension
* 4. ContentHeuristicStrategy (75% confidence) - Content analysis
*
* @param input - File path, URL, Buffer, or data URI
* @param options - Detection and processing options
* @returns Processed file result with type and content
*/
static async detectAndProcess(input, options) {
// Derive filename and size for tracing before detection runs
const inputFilename = FileDetector.deriveInputFilename(input);
const inputSizeBytes = FileDetector.deriveInputSize(input);
return withSpan({
name: "neurolink.file.detect_and_process",
tracer: tracers.file,
attributes: {
[ATTR.FILE_NAME]: inputFilename,
[ATTR.FILE_SIZE_BYTES]: inputSizeBytes,
},
}, async (span) => {
const detection = await FileDetector.detect(input, options);
span.setAttribute(ATTR.FILE_CATEGORY, detection.type);
span.setAttribute(ATTR.FILE_MIMETYPE, detection.mimeType || "unknown");
span.setAttribute(ATTR.FILE_CONFIDENCE, detection.metadata.confidence);
logger.info(`[NEUROLINK] File detected: ${inputFilename} (${detection.mimeType || "unknown"}, ${formatFileSize(inputSizeBytes)}) → category: ${detection.type}`);
// FD-018: Comprehensive fallback parsing for extension-less files
if (options?.allowedTypes &&
!options.allowedTypes.includes(detection.type)) {
const content = await FileDetector.loadContent(input, detection, options);
const errors = [];
for (const allowedType of options.allowedTypes) {
try {
const result = await FileDetector.tryFallbackParsing(content, allowedType, options);
if (result) {
logger.info(`[FileDetector] ✅ ${allowedType.toUpperCase()} fallback successful`);
const outputLength = typeof result.content === "string"
? result.content.length
: result.content?.length || 0;
span.setAttribute(ATTR.FILE_OUTPUT_LENGTH, outputLength);
span.setAttribute(ATTR.FILE_SUCCESS, true);
span.setAttribute(ATTR.FILE_PROCESSOR_USED, `fallback:${allowedType}`);
logger.info(`[NEUROLINK] File processed: ${inputFilename} → ${outputLength} bytes output (fallback: ${allowedType})`);
return result;
}
}
catch (error) {
const errorMsg = error instanceof Error ? error.message : String(error);
errors.push(`${allowedType}: ${errorMsg}`);
logger.debug(`[FileDetector] ${allowedType} fallback failed: ${errorMsg}`);
}
}
logger.warn(`[FileDetector] All fallback parsing failed for type "${detection.type}". ` +
`Attempted: ${options.allowedTypes.join(", ")}. Falling through to universal handler.`);
const csvOptions = options?.csvOptions;
const result = await FileDetector.processFile(content, detection, csvOptions, options?.provider);
FileDetector.setFileResultSpanAttributes(span, result, inputFilename, detection.type);
return result;
}
const content = await FileDetector.loadContent(input, detection, options);
const csvOptions = options?.csvOptions;
const result = await FileDetector.processFile(content, detection, csvOptions, options?.provider);
FileDetector.setFileResultSpanAttributes(span, result, inputFilename, detection.type);
return result;
});
}
/**
* Set span attributes and log after file processing completes.
*/
static setFileResultSpanAttributes(span, result, filename, processorType) {
const outputLength = typeof result.content === "string"
? result.content.length
: result.content?.length || 0;
const hasImages = Array.isArray(result.images)
? result.images.length > 0
: false;
const imageCount = Array.isArray(result.images)
? result.images.length
: 0;
span.setAttribute(ATTR.FILE_OUTPUT_LENGTH, outputLength);
span.setAttribute(ATTR.FILE_SUCCESS, true);
span.setAttribute(ATTR.FILE_PROCESSOR_USED, processorType);
span.setAttribute(ATTR.FILE_HAS_IMAGES, hasImages);
span.setAttribute(ATTR.FILE_IMAGE_COUNT, imageCount);
logger.info(`[NEUROLINK] File processed: ${filename} → ${outputLength} bytes output` +
(imageCount > 0 ? ` + ${imageCount} image(s)` : "") +
` (processor: ${processorType})`);
}
/**
* Derive a human-readable filename from FileInput for tracing.
*/
static deriveInputFilename(input) {
if (typeof input === "string") {
if (input.startsWith("data:")) {
return "data-uri";
}
if (input.startsWith("http")) {
try {
return new URL(input).pathname.split("/").pop() || "url-file";
}
catch {
return "url-file";
}
}
// File path
return input.split("/").pop() || input.split("\\").pop() || "file";
}
if (Buffer.isBuffer(input)) {
return "buffer";
}
return "unknown-input";
}
/**
* Derive byte size from FileInput for tracing.
*/
static deriveInputSize(input) {
if (Buffer.isBuffer(input)) {
return input.length;
}
if (typeof input === "string") {
if (input.startsWith("data:")) {
// Rough estimate: base64 is ~4/3 of raw
const base64Part = input.split(",")[1];
return base64Part ? Math.floor((base64Part.length * 3) / 4) : 0;
}
return input.length; // path or URL string length (not file size)
}
return 0;
}
/**
* Classify a FileInput into the FileSource enum used by downstream
* loaders. Keeps the mimetype-hint short-circuit in detect() able to
* produce a valid FileDetectionResult without re-implementing the
* source-inference rules scattered across loadContent().
*/
static deriveInputSource(input) {
if (Buffer.isBuffer(input)) {
return "buffer";
}
if (typeof input === "string") {
if (input.startsWith("data:")) {
return "datauri";
}
if (input.startsWith("http://") || input.startsWith("https://")) {
return "url";
}
return "path";
}
return "buffer";
}
/**
* Try fallback parsing for a specific file type
* Used when file detection returns "unknown" but we want to try parsing anyway
*/
static async tryFallbackParsing(content, fileType, options) {
logger.info(`[FileDetector] Attempting ${fileType.toUpperCase()} fallback parsing`);
switch (fileType) {
case "csv": {
// Try CSV parsing
const csvOptions = options?.csvOptions;
const result = await CSVProcessor.process(content, csvOptions);
logger.info(`[FileDetector] CSV fallback: ${result.metadata?.rowCount || 0} rows, ${result.metadata?.columnCount || 0} columns`);
return result;
}
case "text": {
// Try text parsing - check if content is valid UTF-8 text
const textContent = content.toString("utf-8");
// Validate it's actually text (no null bytes, mostly printable)
if (FileDetector.isValidText(textContent)) {
return {
type: "text",
content: textContent,
mimeType: FileDetector.guessTextMimeType(textContent),
metadata: {
confidence: 70,
size: content.length,
},
};
}
throw new Error("Content does not appear to be valid text");
}
case "image": {
// Image requires magic bytes - can't fallback without detection
throw new Error("Image type requires binary detection, cannot fallback parse");
}
case "pdf": {
// PDF requires magic bytes - can't fallback without detection
throw new Error("PDF type requires binary detection, cannot fallback parse");
}
case "audio": {
// Audio requires magic bytes - can't fallback without detection
throw new Error("Audio type requires binary detection, cannot fallback parse");
}
case "video": {
// Video requires magic bytes - can't fallback without detection
throw new Error("Video type requires binary detection, cannot fallback parse");
}
case "archive": {
// Archive requires magic bytes - can't fallback without detection
throw new Error("Archive type requires binary detection, cannot fallback parse");
}
case "xlsx": {
// Document formats require binary detection
throw new Error("Excel type requires binary detection, cannot fallback parse");
}
case "docx": {
throw new Error("Word type requires binary detection, cannot fallback parse");
}
case "pptx": {
throw new Error("PowerPoint type requires binary detection, cannot fallback parse");
}
case "svg": {
// SVG can be detected from text content
const svgContent = content.toString("utf-8");
if (svgContent.includes("<svg") && svgContent.includes("</svg>")) {
return {
type: "svg",
content: svgContent,
mimeType: "image/svg+xml",
metadata: {
confidence: 70,
size: content.length,
},
};
}
throw new Error("Content does not appear to be valid SVG");
}
default:
return null;
}
}
/**
* Check if content is valid text (UTF-8, mostly printable)
*/
static isValidText(content) {
// Check for null bytes which indicate binary content
if (content.includes("\0")) {
return false;
}
// Check if content has reasonable amount of printable characters
let printableCount = 0;
for (let i = 0; i < content.length; i++) {
const code = content.charCodeAt(i);
if ((code >= 32 && code < 127) || // ASCII printable
code === 9 || // Tab
code === 10 || // Newline
code === 13 || // Carriage return
code > 127 // Unicode (non-ASCII)
) {
printableCount++;
}
}
// At least 90% should be printable
return printableCount / content.length >= 0.9;
}
/**
* Guess the MIME type for text content based on content patterns
*/
static guessTextMimeType(content) {
const trimmed = content.trim();
// Check for JSON
if ((trimmed.startsWith("{") && trimmed.endsWith("}")) ||
(trimmed.startsWith("[") && trimmed.endsWith("]"))) {
try {
JSON.parse(trimmed);
return "application/json";
}
catch {
// Not valid JSON, continue checking
}
}
// Check for XML/HTML using stricter detection
if (FileDetector.looksLikeXMLStrict(trimmed)) {
const isHTML = trimmed.includes("<!DOCTYPE html") ||
trimmed.toLowerCase().includes("<html") ||
trimmed.includes("<head") ||
trimmed.includes("<body");
return isHTML ? "text/html" : "application/xml";
}
// Check for YAML using robust multi-indicator detection
if (FileDetector.looksLikeYAMLStrict(trimmed)) {
return "application/yaml";
}
// Default to plain text
return "text/plain";
}
/**
* Strict YAML detection for guessTextMimeType
* Similar to ContentHeuristicStrategy but requires at least 2 indicators
* to avoid false positives from simple key: value patterns
*/
static looksLikeYAMLStrict(text) {
if (text.length === 0) {
return false;
}
const lines = text.split("\n");
// For single-line content, only --- or ... qualify as YAML
if (lines.length === 1) {
return text === "---" || text === "...";
}
// Collect YAML indicators (requires at least 2 for positive detection)
const indicators = [];
// Indicator 1: Document start marker (---)
indicators.push(text.startsWith("---"));
// Indicator 2: Document end marker (...)
indicators.push(/^\.\.\.$|[\n]\.\.\.$/.test(text));
// Indicator 3: YAML list items (- followed by space)
indicators.push(/^[\s]*-\s+[^-]/m.test(text));
// Indicator 4: Multiple key-value pairs (at least 2)
const keyValuePattern = /^[\s]*[a-zA-Z_][a-zA-Z0-9_-]*:\s*(.+)$/;
const keyValueMatches = lines.filter((line) => keyValuePattern.test(line)).length;
indicators.push(keyValueMatches >= 2);
// Require at least 2 indicators for confident YAML detection
const matchCount = indicators.filter(Boolean).length;
return matchCount >= 2;
}
/**
* Strict XML detection for guessTextMimeType
* Ensures content has proper XML declaration or valid tag structure with closing tags
* Prevents false positives from arbitrary content starting with <
*/
static looksLikeXMLStrict(content) {
// XML declaration is a definitive marker
if (content.startsWith("<?xml")) {
return true;
}
// Must start with < for XML/HTML
if (!content.startsWith("<")) {
return false;
}
// Check for HTML DOCTYPE declaration
if (content.includes("<!DOCTYPE html")) {
return true;
}
// Must have valid opening tag structure: <tagname
// Not just any < character like "< something"
const hasValidOpeningTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?>/;
if (!hasValidOpeningTag.test(content)) {
return false;
}
// Must have at least one closing tag or self-closing tag to be valid XML/HTML
const hasClosingTag = /<\/[a-zA-Z][a-zA-Z0-9-]*>/.test(content);
const hasSelfClosingTag = /<[a-zA-Z][a-zA-Z0-9-]*(?:\s[^>]*)?\s*\/\s*>/.test(content);
return hasClosingTag || hasSelfClosingTag;
}
/**
* Detect file type using multi-strategy approach
* Stops at first strategy with confidence >= threshold (default: 80%)
*/
static async detect(input, options) {
// Short-circuit on a trustworthy caller-provided mimetype hint. This is
// the eager-path counterpart to FileReferenceRegistry.register()'s hint
// handling — necessary for tiny files (<= TINY_MAX) that skip the lazy
// registry path. normalizeMimeHint drops "application/octet-stream" so a
// caller cannot hide real content behind the opaque sentinel.
const hintMime = normalizeMimeHint(options?.mimetypeHint);
if (hintMime) {
const type = mimeHintToFileType(hintMime);
if (type) {
const ext = mimeHintToExtension(hintMime);
const result = {
type,
mimeType: hintMime,
extension: ext || null,
source: FileDetector.deriveInputSource(input),
metadata: {
confidence: 95,
filename: FileDetector.deriveInputFilename(input),
size: FileDetector.deriveInputSize(input),
},
};
logger.info(`[FileDetector] Type: ${type} (95%, from mimetype hint: ${hintMime})`);
return result;
}
}
const confidenceThreshold = options?.confidenceThreshold ?? 80;
const strategies = [
new MagicBytesStrategy(),
new MimeTypeStrategy(),
new ExtensionStrategy(),
new ContentHeuristicStrategy(),
];
let best = null;
for (const strategy of strategies) {
const result = await strategy.detect(input);
if (!best || result.metadata.confidence > best.metadata.confidence) {
best = result;
}
if (result.metadata.confidence >= confidenceThreshold) {
logger.info(`[FileDetector] Type: ${result.type} (${result.metadata.confidence}%)`);
return result;
}
}
logger.warn(`[FileDetector] Low confidence: ${best?.type ?? "unknown"} (${best?.metadata.confidence ?? 0}%)`);
return best;
}
/**
* Load file content from various sources
*/
static async loadContent(input, detection, options) {
let source = detection.source;
if (source === "buffer" && !Buffer.isBuffer(input)) {
if (typeof input === "string") {
if (input.startsWith("data:")) {
source = "datauri";
}
else if (input.startsWith("http://") ||
input.startsWith("https://")) {
source = "url";
}
else {
source = "path";
}
}
}
switch (source) {
case "url":
return await FileDetector.loadFromURL(input, options);
case "path":
return await FileDetector.loadFromPath(input, options);
case "buffer":
return input;
case "datauri":
return FileDetector.loadFromDataURI(input);
default:
throw new Error(`Unknown source: ${source}`);
}
}
/**
 * SDK-8: Build an informative placeholder for a file whose processor
 * failed or returned nothing usable. Rather than a bare
 * "[Video file: name]" string, the text includes the size, the format and
 * the failure reason so the LLM can acknowledge the attachment.
 *
 * Sizes are shown in bytes, KB or MB with one decimal place.
 */
static formatInformativePlaceholder(typeName, filename, content, detection, error) {
    const KB = 1024;
    const MB = KB * KB;
    const byteLength = content.length;
    let sizeStr;
    if (byteLength < KB) {
        sizeStr = `${byteLength} bytes`;
    }
    else if (byteLength < MB) {
        sizeStr = `${(byteLength / KB).toFixed(1)} KB`;
    }
    else {
        sizeStr = `${(byteLength / MB).toFixed(1)} MB`;
    }
    // Default reason applies when no (truthy) error was supplied.
    let errorMsg = "Processing returned no usable content";
    if (error instanceof Error) {
        errorMsg = error.message;
    }
    else if (error) {
        errorMsg = String(error);
    }
    const lines = [
        `[${typeName} File: "${filename}"]`,
        `Size: ${sizeStr}`,
        `Format: ${detection.mimeType || "unknown"}`,
        `Error: Could not extract content (${errorMsg}).`,
        `The file was attached but could not be fully analyzed.`,
    ];
    return lines.join("\n");
}
/**
* Extract metadata and printable strings from an unrecognized binary file.
* This is the "extract what you can" path for unknown file types.
*
* Extracts:
* - File size (human-readable)
* - MIME type / detected format
* - First N bytes as hex dump (for identification)
* - Printable ASCII/UTF-8 strings found in the binary (like `strings` command)
* - Known file signatures that we don't have full processors for
*
* @param content Raw file buffer
* @param detection Detection result (may be "unknown")
* @param filename Original filename (if known)
* @returns Formatted text summary suitable for LLM consumption
*/
static extractBinaryMetadata(content, detection, filename) {
const parts = [];
// Header
const ext = detection.extension
? `.${detection.extension}`
: filename.includes(".")
? filename.slice(filename.lastIndexOf("."))
: "";
const typeLabel = ext
? `${ext.toUpperCase().slice(1)} file`
: "Binary file";
parts.push(`[${typeLabel}: "${filename}"]`);
// Basic metadata
const sizeStr = formatFileSize(content.length);
parts.push(`Size: ${sizeStr}`);
if (detection.mimeType &&
detection.mimeType !== "application/octet-stream") {
parts.push(`Format: ${detection.mimeType}`);
}
// Known binary signature identification (broader than our processing capabilities)
const sigLabel = FileDetector.identifyBinarySignature(content);
if (sigLabel) {
parts.push(`Identified as: ${sigLabel}`);
}
// Hex dump of first 32 bytes for identification
const hexPreview = content
.subarray(0, Math.min(32, content.length))
.toString("hex")
.match(/.{1,2}/g)
?.join(" ");
if (hexPreview) {
parts.push(`Header bytes: ${hexPreview}`);
}
// Extract printable strings (similar to Unix `strings` command)
const strings = FileDetector.extractPrintableStrings(content, 4, 50);
if (strings.length > 0) {
parts.push(`\nEmbedded text found (${strings.length} string${strings.length > 1 ? "s" : ""}):`);
for (const s of strings) {
parts.push(` "${s}"`);
}
}
parts.push(`\nThis file was attached but its format is not fully supported for content extraction.`);
parts.push(`The above metadata and any embedded text have been extracted for context.`);
return parts.join("\n");
}
/**
* Identify known binary file signatures beyond what we can process.
* Returns a human-readable description, or null if unrecognized.
*/
static identifyBinarySignature(buf) {
if (buf.length < 4) {
return null;
}
// SQLite: "SQLite format 3\0"
if (buf.length >= 16 &&
buf.subarray(0, 15).toString("ascii") === "SQLite format 3") {
return "SQLite database";
}
// WOFF: "wOFF"
if (buf[0] === 0x77 &&
buf[1] === 0x4f &&
buf[2] === 0x46 &&
buf[3] === 0x46) {
return "WOFF font";
}
// WOFF2: "wOF2"
if (buf[0] === 0x77 &&
buf[1] === 0x4f &&
buf[2] === 0x46 &&
buf[3] === 0x32) {
return "WOFF2 font";
}
// TrueType/OpenType: starts with 0x00010000 or "OTTO"
if ((buf[0] === 0x00 &&
buf[1] === 0x01 &&
buf[2] === 0x00 &&
buf[3] === 0x00) ||
(buf[0] === 0x4f && buf[1] === 0x54 && buf[2] === 0x54 && buf[3] === 0x4f)) {
return "TrueType/OpenType font";
}
// ELF executable: \x7fELF
if (buf[0] === 0x7f &&
buf[1] === 0x45 &&
buf[2] === 0x4c &&
buf[3] === 0x46) {
return "ELF executable/library";
}
// Mach-O: 0xFEEDFACE or 0xFEEDFACF (64-bit) or 0xCAFEBABE (universal)
if ((buf[0] === 0xfe &&
buf[1] === 0xed &&
buf[2] === 0xfa &&
buf[3] === 0xce) ||
(buf[0] === 0xfe &&
buf[1] === 0xed &&
buf[2] === 0xfa &&
buf[3] === 0xcf) ||
(buf[0] === 0xca && buf[1] === 0xfe && buf[2] === 0xba && buf[3] === 0xbe)) {
return "Mach-O executable/library";
}
// PE/Windows executable: "MZ"
if (buf[0] === 0x4d && buf[1] === 0x5a) {
return "Windows PE executable/DLL";
}
// WebAssembly: "\0asm"
if (buf[0] === 0x00 &&
buf[1] === 0x61 &&
buf[2] === 0x73 &&
buf[3] === 0x6d) {
return "WebAssembly binary";
}
// DWG (AutoCAD): starts with "AC10"
if (buf[0] === 0x41 &&
buf[1] === 0x43 &&
buf[2] === 0x31 &&
buf[3] === 0x30) {
return "AutoCAD DWG drawing";
}
// BZ2: "BZ" + 'h'
if (buf[0] === 0x42 && buf[1] === 0x5a && buf[2] === 0x68) {
return "BZip2 compressed archive";
}
// XZ: 0xFD + "7zXZ"
if (buf.length >= 6 &&
buf[0] === 0xfd &&
buf[1] === 0x37 &&
buf[2] === 0x7a &&
buf[3] === 0x58 &&
buf[4] === 0x5a &&
buf[5] === 0x00) {
return "XZ compressed archive";
}
// 7z: "7z" + BC AF 27 1C
if (buf.length >= 6 &&
buf[0] === 0x37 &&
buf[1] === 0x7a &&
buf[2] === 0xbc &&
buf[3] === 0xaf &&
buf[4] === 0x27 &&
buf[5] === 0x1c) {
return "7-Zip archive";
}
// ISO 9660: "CD001" at offset 32769
if (buf.length > 32773 &&
buf.subarray(32769, 32774).toString("ascii") === "CD001") {
return "ISO 9660 disc image";
}
// Apache Parquet: "PAR1"
if (buf[0] === 0x50 &&
buf[1] === 0x41 &&
buf[2] === 0x52 &&
buf[3] === 0x31) {
return "Apache Parquet data file";
}
// Protocol Buffers compiled: (no fixed magic, skip)
// TIFF (already handled as image, but including for completeness)
if ((buf[0] === 0x49 &&
buf[1] === 0x49 &&
buf[2] === 0x2a &&
buf[3] === 0x00) ||
(buf[0] === 0x4d && buf[1] === 0x4d && buf[2] === 0x00 && buf[3] === 0x2a)) {
return "TIFF image";
}
// ICO: 00 00 01 00
if (buf[0] === 0x00 &&
buf[1] === 0x00 &&
buf[2] === 0x01 &&
buf[3] === 0x00) {
return "ICO icon image";
}
return null;
}
/**
* Extract printable ASCII strings from a binary buffer.
* Similar to the Unix `strings` utility.
*
* @param buf Buffer to scan
* @param minLength Minimum string length to include (default 4)
* @param maxStrings Maximum number of strings to return (default 50)
* @returns Array of printable strings found in the binary
*/
static extractPrintableStrings(buf, minLength = 4, maxStrings = 50) {
const strings = [];
let current = "";
// Only scan first 64KB to avoid huge processing time
const scanLimit = Math.min(buf.length, 64 * 1024);
for (let i = 0; i < scanLimit; i++) {
const byte = buf[i];
// Printable ASCII range (space through tilde) plus tab
if ((byte >= 0x20 && byte <= 0x7e) || byte === 0x09) {
current += String.fromCharCode(byte);
}
else {
if (current.length >= minLength) {
strings.push(current);
if (strings.length >= maxStrings) {
break;
}
}
current = "";
}
}
// Flush last string
if (current.length >= minLength && strings.length < maxStrings) {
strings.push(current);
}
return strings;
}
/**
* Route to appropriate processor
*/
static async processFile(content, detection, options, provider) {
switch (detection.type) {
case "csv":
// Pass original extension through to CSV processor; if detection has none,
// fall back to any extension provided in csvOptions.
return await CSVProcessor.process(content, {
...options,
extension: detection.extension ?? options?.extension,
});
case "image":
return await ImageProcessor.process(content);
case "pdf":
return await PDFProcessor.process(content, { provider });
case "svg":
// SVG is processed as text content (sanitized XML markup)
// AI providers don't support SVG as image format, so we extract text content
return await FileDetector.processSvgAsText(content, detection);
case "video":
return await FileDetector.processVideoFile(content, detection);
case "audio":
return await FileDetector.processAudioFile(content, detection);
case "archive":
return await FileDetector.processArchiveFile(content, detection);
case "xlsx":
return await FileDetector.processXlsxFile(content, detection);
case "docx":
return await FileDetector.processDocxFile(content, detection);
case "pptx":
return await FileDetector.processPptxFile(content, detection);
case "text":
return {
type: "text",
content: content.toString("utf-8"),
mimeType: detection.mimeType || "text/plain",
metadata: detection.metadata,
};
default: {
// Graceful degradation: try to treat unknown types as text if content is valid UTF-8
const unknownContent = content.toString("utf-8");
if (FileDetector.isValidText(unknownContent)) {
logger.warn(`[FileDetector] Unknown type "${detection.type}", treating as text`);
return {
type: "text",
content: unknownContent,
mimeType: detection.mimeType || "text/plain",
metadata: detection.metadata,
};
}
// Binary file that we can't fully process — extract what we can
// (metadata, printable strings, signature identification)
const filename = detection.metadata.filename || "file";
logger.warn(`[FileDetector] Unknown binary type "${detection.type}", extracting metadata for "${filename}"`);
return {
type: "unknown",
content: FileDetector.extractBinaryMetadata(content, detection, filename),
mimeType: detection.mimeType || "application/octet-stream",
metadata: detection.metadata,
};
}
}
}
/**
* Process video file: extract metadata, keyframes, and subtitles via VideoProcessor
*/
static async processVideoFile(content, detection) {
const videoFilename = detection.metadata.filename || "video";
try {
const videoResult = await (await getVideoProcessor()).processFile({
id: videoFilename,
name: videoFilename,
mimetype: detection.mimeType || "video/mp4",
size: content.length,
buffer: content,
});
if (videoResult.success && videoResult.data) {
return {
type: "video",
content: videoResult.data.textContent ||
FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection),
mimeType: detection.mimeType,
images: videoResult.data.keyframes && videoResult.data.keyframes.length > 0
? videoResult.data.keyframes
: undefined,
metadata: {
...detection.metadata,
frameCount: videoResult.data.frameCount,
hasKeyframes: videoResult.data.hasKeyframes,
},
};
}
}
catch (videoError) {
logger.warn(`[FileDetector] VideoProcessor failed for ${videoFilename}, using fallback`, videoError instanceof Error ? videoError.message : String(videoError));
return {
type: "video",
content: FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection, videoError),
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
// Fallback if processor returned no data
return {
type: "video",
content: FileDetector.formatInformativePlaceholder("Video", videoFilename, content, detection),
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
/**
* Process audio file: extract metadata, tags, and cover art via AudioProcessor
*/
static async processAudioFile(content, detection) {
const audioFilename = detection.metadata.filename || "audio";
try {
const audioResult = await (await getAudioProcessor()).processFile({
id: audioFilename,
name: audioFilename,
mimetype: detection.mimeType || "audio/mpeg",
size: content.length,
buffer: content,
});
if (audioResult.success && audioResult.data) {
return {
type: "audio",
content: audioResult.data.textContent ||
FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection),
mimeType: detection.mimeType,
// Surface embedded cover art as an image content block
images: audioResult.data.coverArt
? [audioResult.data.coverArt]
: undefined,
metadata: detection.metadata,
};
}
}
catch (audioError) {
logger.warn(`[FileDetector] AudioProcessor failed for ${audioFilename}, using fallback`, audioError instanceof Error ? audioError.message : String(audioError));
return {
type: "audio",
content: FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection, audioError),
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
// Fallback if processor returned no data
return {
type: "audio",
content: FileDetector.formatInformativePlaceholder("Audio", audioFilename, content, detection),
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
/**
* Process archive file: list contents and extract metadata via ArchiveProcessor
*/
static async processArchiveFile(content, detection) {
const archiveFilename = detection.metadata.filename || "archive";
try {
const archiveResult = await (await getArchiveProcessor()).processFile({
id: archiveFilename,
name: archiveFilename,
mimetype: detection.mimeType || "application/zip",
size: content.length,
buffer: content,
});
if (archiveResult.success && archiveResult.data) {
return {
type: "archive",
content: archiveResult.data.textContent ||
FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection),
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
}
catch (archiveError) {
logger.warn(`[FileDetector] ArchiveProcessor failed for ${archiveFilename}, using fallback`, archiveError instanceof Error
? archiveError.message
: String(archiveError));
return {
type: "archive",
content: FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection, archiveError),
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
// Fallback if processor returned no data
return {
type: "archive",
content: FileDetector.formatInformativePlaceholder("Archive", archiveFilename, content, detection),
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
/**
* Process Excel/OpenDocument spreadsheet file via ExcelProcessor or OpenDocumentProcessor
*/
static async processXlsxFile(content, detection) {
const xlsxFilename = detection.metadata.filename || "spreadsheet";
try {
const ext = detection.extension?.toLowerCase();
if (ext === "ods") {
const { openDocumentProcessor } = await import("../processors/document/OpenDocumentProcessor.js");
const odsResult = await openDocumentProcessor.processFile({
id: xlsxFilename,
name: xlsxFilename,
mimetype: detection.mimeType ||
"application/vnd.oasis.opendocument.spreadsheet",
size: content.length,
buffer: content,
});
if (odsResult.success && odsResult.data) {
return {
type: "xlsx",
content: odsResult.data.textContent ||
FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection),
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
}
else {
const { excelProcessor } = await import("../processors/document/ExcelProcessor.js");
const xlsxResult = await excelProcessor.processFile({
id: xlsxFilename,
name: xlsxFilename,
mimetype: detection.mimeType ||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
size: content.length,
buffer: content,
});
if (xlsxResult.success && xlsxResult.data) {
// Build text content from worksheets
const sheets = xlsxResult.data.worksheets || [];
let textContent = `Spreadsheet: ${sheets.length} sheet(s), ${xlsxResult.data.totalRows} total rows\n`;
for (const sheet of sheets) {
textContent += `\n### Sheet: ${sheet.name}\n`;
textContent += `Columns (${sheet.columnCount}): ${sheet.headers.join(", ")}\n`;
textContent += `Rows: ${sheet.rowCount}\n`;
// Include first rows as sample data
const sampleRows = sheet.rows.slice(0, 20);
const rowText = sampleRows
.map((row) => row.map((c) => String(c ?? "")).join("\t"))
.join("\n");
if (!rowText) {
continue;
}
textContent += `\nData:\n${sheet.headers.join("\t")}\n${rowText}\n`;
const remaining = sheet.rowCount - 20;
if (remaining > 0) {
textContent += `... (${remaining} more rows)\n`;
}
}
return {
type: "xlsx",
content: textContent,
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
}
}
catch (xlsxError) {
logger.warn(`[FileDetector] ExcelProcessor failed for ${xlsxFilename}, using fallback`, xlsxError instanceof Error ? xlsxError.message : String(xlsxError));
return {
type: "xlsx",
content: FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection, xlsxError),
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
// Fallback if processor returned no data
return {
type: "xlsx",
content: FileDetector.formatInformativePlaceholder("Spreadsheet", xlsxFilename, content, detection),
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
/**
* Process Word/OpenDocument/RTF document via WordProcessor, OpenDocumentProcessor, or RtfProcessor
*/
static async processDocxFile(content, detection) {
const docxFilename = detection.metadata.filename || "document";
const ext = detection.extension?.toLowerCase();
try {
if (ext === "odt") {
const { openDocumentProcessor } = await import("../processors/document/OpenDocumentProcessor.js");
const odtResult = await openDocumentProcessor.processFile({
id: docxFilename,
name: docxFilename,
mimetype: detection.mimeType || "application/vnd.oasis.opendocument.text",
size: content.length,
buffer: content,
});
if (odtResult.success && odtResult.data) {
return {
type: "docx",
content: odtResult.data.textContent ||
FileDetector.formatInformativePlaceholder("Document", docxFilename, content, detection),
mimeType: detection.mimeType,
metadata: detection.metadata,
};
}
}
else if (ext === "rtf") {
const { rtfProcessor } = await import("../processors/document/RtfProcessor.js");