UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio

1,180 lines (1,179 loc) 55.4 kB
/** * Archive Processor * * Handles downloading, validating, and processing archive files (ZIP, TAR, TAR.GZ, GZ). * Extracts file listings with metadata for AI consumption without recursively * processing individual entries through other processors (Phase 1). * * Key features: * - ZIP support via adm-zip (dynamic import) * - TAR / TAR.GZ support via tar-stream (dynamic import) * - Plain GZ support via Node zlib * - Comprehensive security validation (path traversal, zip bombs, symlinks, encryption) * - In-memory extraction with configurable size limits * - Structured text output for LLM consumption * * @module processors/archive/ArchiveProcessor * * @example * ```typescript * import { archiveProcessor, processArchive, isArchiveFile } from "./ArchiveProcessor.js"; * * // Check if a file is an archive * if (isArchiveFile(fileInfo.mimetype, fileInfo.name)) { * const result = await processArchive(fileInfo, { * authHeaders: { Authorization: "Bearer token" }, * }); * * if (result.success) { * console.log(`Format: ${result.data.archiveMetadata.format}`); * console.log(`Entries: ${result.data.archiveMetadata.totalEntries}`); * for (const entry of result.data.entries) { * console.log(` ${entry.name} (${entry.uncompressedSize} bytes)`); * } * } * } * ``` */ import * as path from "path"; import { BaseFileProcessor } from "../base/BaseFileProcessor.js"; import { SIZE_LIMITS_MB } from "../config/index.js"; import { FileErrorCode } from "../errors/index.js"; // ============================================================================= // TYPES // ============================================================================= // ============================================================================= // SECURITY CONFIGURATION // ============================================================================= /** * Security limits for archive processing. * These values are intentionally conservative to prevent resource exhaustion * and common archive-based attacks (zip bombs, path traversal, etc.). */ const ARCHIVE_SECURITY = { /** Maximum number of entries allowed in a single archive */ MAX_ENTRIES: 1000, /** Maximum total decompressed size allowed (100 MB) */ MAX_DECOMPRESSED_SIZE: 100 * 1024 * 1024, /** Maximum size of any single file within the archive (20 MB) */ MAX_SINGLE_FILE_SIZE: 20 * 1024 * 1024, /** Maximum compression ratio before flagging as potential zip bomb */ MAX_COMPRESSION_RATIO: 100, /** * Maximum archive nesting depth. * Phase 1 only lists contents (no recursive extraction), so depth is 1. */ MAX_NESTING_DEPTH: 1, /** Maximum path length for any entry name */ MAX_PATH_LENGTH: 255, /** Whether to allow encrypted archive entries */ ALLOW_ENCRYPTED: false, /** Whether to allow symbolic link entries */ ALLOW_SYMLINKS: false, }; /** * Archive processor configuration constants. */ const ARCHIVE_CONFIG = { /** Maximum archive file size in MB (uses centralized constant from sizeLimits) */ MAX_SIZE_MB: SIZE_LIMITS_MB.ARCHIVE_MAX_MB, /** Processing timeout in milliseconds (60 seconds) */ TIMEOUT_MS: 60_000, /** Maximum number of entries to extract content from (Phase 2 sub-processing) */ MAX_EXTRACT_ENTRIES: 20, /** Maximum size of a single entry to extract for content processing (1 MB) */ MAX_EXTRACT_ENTRY_SIZE: 1 * 1024 * 1024, /** Maximum total extracted content size across all entries (5 MB) */ MAX_TOTAL_EXTRACT_SIZE: 5 * 1024 * 1024, /** File extensions eligible for content extraction inside archives */ EXTRACTABLE_EXTENSIONS: new Set([ ".ts", ".js", ".tsx", ".jsx", ".py", ".java", ".go", ".rs", ".rb", ".php", ".c", ".cpp", ".h", ".hpp", ".cs", ".swift", ".kt", ".scala", ".sh", ".bash", ".txt", ".md", ".json", ".yaml", ".yml", ".xml", ".html", ".css", ".sql", ".toml", ".ini", ".cfg", ".env", ".csv", ".log", ".conf", ".dockerfile", ".makefile", ".gitignore", ".editorconfig", ]), }; // ============================================================================= // SUPPORTED FORMATS // ============================================================================= /** MIME types recognized as archive formats */ const SUPPORTED_ARCHIVE_MIME_TYPES = [ "application/zip", "application/x-zip-compressed", "application/x-zip", "application/x-tar", "application/x-gtar", "application/gzip", "application/x-gzip", "application/x-compressed-tar", "application/x-bzip2", "application/java-archive", ]; /** File extensions recognized as archive formats */ const SUPPORTED_ARCHIVE_EXTENSIONS = [".zip", ".tar", ".gz", ".tgz", ".bz2", ".tbz2", ".jar"]; // ============================================================================= // MAGIC BYTE SIGNATURES // ============================================================================= /** * Magic byte signatures for archive format detection. * Used alongside file extension for robust format identification. */ const MAGIC_BYTES = { /** ZIP/JAR: PK\x03\x04 */ ZIP: [0x50, 0x4b, 0x03, 0x04], /** ZIP empty archive: PK\x05\x06 */ ZIP_EMPTY: [0x50, 0x4b, 0x05, 0x06], /** ZIP spanned: PK\x07\x08 */ ZIP_SPANNED: [0x50, 0x4b, 0x07, 0x08], /** GZIP: \x1f\x8b */ GZIP: [0x1f, 0x8b], /** BZIP2: BZ */ BZIP2: [0x42, 0x5a], /** RAR: Rar!\x1a\x07 */ RAR: [0x52, 0x61, 0x72, 0x21, 0x1a, 0x07], /** 7-Zip: 7z\xbc\xaf\x27\x1c */ SEVEN_ZIP: [0x37, 0x7a, 0xbc, 0xaf, 0x27, 0x1c], }; // ============================================================================= // ARCHIVE PROCESSOR CLASS // ============================================================================= /** * Archive Processor - handles ZIP, TAR, TAR.GZ, and plain GZ files. * * Overrides the base `processFile()` to implement a custom pipeline: * 1. Validate file type and size * 2. Obtain the archive buffer (from provided buffer or URL download) * 3. Detect the archive format via magic bytes and file extension * 4. Run security validation (path traversal, zip bombs, encryption, symlinks) * 5. Extract entry metadata (no recursive file processing in Phase 1) * 6. Build LLM-friendly text content with file listing * * RAR and 7z formats are detected but not yet supported for extraction. * * @example * ```typescript * const processor = new ArchiveProcessor(); * * const result = await processor.processFile(fileInfo, { * authHeaders: { Authorization: "Bearer token" }, * }); * * if (result.success) { * console.log(`Format: ${result.data.archiveMetadata.format}`); * console.log(`Entries: ${result.data.entries.length}`); * console.log(result.data.textContent); * } * ``` */ export class ArchiveProcessor extends BaseFileProcessor { constructor() { super({ maxSizeMB: ARCHIVE_CONFIG.MAX_SIZE_MB, timeoutMs: ARCHIVE_CONFIG.TIMEOUT_MS, supportedMimeTypes: [...SUPPORTED_ARCHIVE_MIME_TYPES], supportedExtensions: [...SUPPORTED_ARCHIVE_EXTENSIONS], fileTypeName: "archive", defaultFilename: "archive.zip", }); } // =========================================================================== // ABSTRACT METHOD IMPLEMENTATION // =========================================================================== /** * Build a stub processed result. * The actual work is done in the `processFile()` override; this method * satisfies the abstract contract from `BaseFileProcessor`. * * @param buffer - Raw archive buffer * @param fileInfo - Original file information * @returns Empty ProcessedArchive scaffold */ buildProcessedResult(buffer, fileInfo) { return { buffer, mimetype: fileInfo.mimetype || "application/octet-stream", size: buffer.length, filename: this.getFilename(fileInfo), textContent: "", archiveMetadata: { format: "zip", totalEntries: 0, totalUncompressedSize: 0, totalCompressedSize: 0, }, entries: [], securityWarnings: [], }; } // =========================================================================== // MAIN PROCESSING PIPELINE (override) // =========================================================================== /** * Process an archive file through the full extraction pipeline. * * @param fileInfo - File information (can include URL or buffer) * @param options - Optional processing options (auth headers, timeout, etc.) * @returns Processing result with archive metadata and entry listing, or error */ async processFile(fileInfo, options) { try { // Step 1: Validate file type and size const validationResult = this.validateFileWithResult(fileInfo); if (!validationResult.success) { return { success: false, error: validationResult.error }; } // Step 2: Get file buffer let buffer; if (fileInfo.buffer) { buffer = fileInfo.buffer; } else if (fileInfo.url) { const downloadResult = await this.downloadFileWithRetry(fileInfo, options); if (!downloadResult.success) { return { success: false, error: downloadResult.error }; } if (!downloadResult.data) { return { success: false, error: this.createError(FileErrorCode.DOWNLOAD_FAILED, { reason: "Download succeeded but returned no data", }), }; } buffer = downloadResult.data; // Validate actual downloaded size against limit if (!this.validateFileSize(buffer.length)) { return { success: false, error: this.createError(FileErrorCode.FILE_TOO_LARGE, { sizeMB: (buffer.length / (1024 * 1024)).toFixed(2), maxMB: this.config.maxSizeMB, type: this.config.fileTypeName, }), }; } } else { return { success: false, error: this.createError(FileErrorCode.DOWNLOAD_FAILED, { reason: "No buffer or URL provided for file", }), }; } // Step 3: Detect archive format const filename = this.getFilename(fileInfo); const format = this.detectArchiveFormat(buffer, filename); if (!format) { return { success: false, error: this.createError(FileErrorCode.INVALID_FORMAT, { reason: "Unable to detect archive format from magic bytes or file extension", }), }; } // Step 4: Check for unsupported formats (RAR, 7z) if (format === "rar" || format === "7z") { return { success: false, error: this.createError(FileErrorCode.UNSUPPORTED_TYPE, { format, reason: `${format.toUpperCase()} archives are not yet supported. Please convert to ZIP or TAR format.`, supportedFormats: "ZIP, TAR, TAR.GZ, GZ", }), }; } // Step 5: Extract entries based on format const extractionResult = await this.extractEntries(buffer, format); if (!extractionResult.success) { return { success: false, error: extractionResult.error, }; } const { entries, securityWarnings } = extractionResult; // Step 6: Compute aggregate metadata const totalUncompressedSize = entries.reduce((sum, e) => sum + e.uncompressedSize, 0); const totalCompressedSize = entries.reduce((sum, e) => sum + e.compressedSize, 0); // Step 7: Security check - overall compression ratio if (buffer.length > 0 && totalUncompressedSize > 0) { const overallRatio = totalUncompressedSize / buffer.length; if (overallRatio > ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO) { return { success: false, error: this.createError(FileErrorCode.ZIP_BOMB_DETECTED, { compressionRatio: overallRatio.toFixed(1), maxRatio: ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO, }), }; } } // Step 8: Security check - total decompressed size if (totalUncompressedSize > ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE) { return { success: false, error: this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, { reason: `Total decompressed size (${this.formatSizeMB(totalUncompressedSize)} MB) exceeds limit (${this.formatSizeMB(ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE)} MB)`, }), }; } // Step 9: Extract content from text-based entries (Phase 2 sub-processing) // For ZIP archives, extract and include content from small text-based files. // Skips nested archives and binary files for safety. let extractedContents = new Map(); if (format === "zip") { extractedContents = await this.extractEntryContents(buffer, entries); } // Step 10: Build text content for LLM const archiveMetadata = { format, totalEntries: entries.length, totalUncompressedSize, totalCompressedSize, }; const textContent = this.buildTextContent(filename, archiveMetadata, entries, securityWarnings, extractedContents); // Step 10: Build final result return { success: true, data: { buffer, mimetype: fileInfo.mimetype || "application/octet-stream", size: buffer.length, filename, textContent, archiveMetadata, entries, securityWarnings, }, }; } catch (error) { return { success: false, error: this.createError(FileErrorCode.PROCESSING_FAILED, { fileType: "archive", error: error instanceof Error ? error.message : String(error), }, error instanceof Error ? error : undefined), }; } } // =========================================================================== // FORMAT DETECTION // =========================================================================== /** * Detect the archive format using magic bytes and file extension. * Magic bytes take precedence over extension when available. * * @param buffer - Raw archive buffer * @param filename - Original filename for extension-based fallback * @returns Detected archive format, or null if unrecognized */ detectArchiveFormat(buffer, filename) { // Try magic bytes first (most reliable) const magicFormat = this.detectFormatFromMagicBytes(buffer); if (magicFormat) { // For GZIP, check if it wraps a TAR archive if (magicFormat === "gz") { const ext = filename.toLowerCase(); if (ext.endsWith(".tar.gz") || ext.endsWith(".tgz") || ext.endsWith(".tbz2")) { return "tar.gz"; } // Could still be a tar.gz without the extension - we'll detect during extraction return "gz"; } return magicFormat; } // Fallback to extension-based detection return this.detectFormatFromExtension(filename); } /** * Detect archive format from magic bytes at the start of the buffer. * * @param buffer - Raw archive buffer * @returns Detected format, or null if magic bytes don't match any known format */ detectFormatFromMagicBytes(buffer) { if (buffer.length < 2) { return null; } // Check for 7-Zip (6 bytes) if (buffer.length >= 6 && this.matchesMagic(buffer, MAGIC_BYTES.SEVEN_ZIP)) { return "7z"; } // Check for RAR (6+ bytes) if (buffer.length >= 6 && this.matchesMagic(buffer, MAGIC_BYTES.RAR)) { return "rar"; } // Check for ZIP/JAR (4 bytes) if (buffer.length >= 4 && (this.matchesMagic(buffer, MAGIC_BYTES.ZIP) || this.matchesMagic(buffer, MAGIC_BYTES.ZIP_EMPTY) || this.matchesMagic(buffer, MAGIC_BYTES.ZIP_SPANNED))) { return "zip"; } // Check for GZIP (2 bytes) if (this.matchesMagic(buffer, MAGIC_BYTES.GZIP)) { return "gz"; } // Check for BZIP2 (2 bytes) if (this.matchesMagic(buffer, MAGIC_BYTES.BZIP2)) { return "tar.bz2"; } return null; } /** * Detect archive format from file extension. * * @param filename - Filename to extract extension from * @returns Detected format, or null if extension is unrecognized */ detectFormatFromExtension(filename) { const lowerFilename = filename.toLowerCase(); if (lowerFilename.endsWith(".tar.gz") || lowerFilename.endsWith(".tgz")) { return "tar.gz"; } if (lowerFilename.endsWith(".tar.bz2") || lowerFilename.endsWith(".tbz2")) { return "tar.bz2"; } if (lowerFilename.endsWith(".tar")) { return "tar"; } if (lowerFilename.endsWith(".gz")) { return "gz"; } if (lowerFilename.endsWith(".bz2")) { return "tar.bz2"; } if (lowerFilename.endsWith(".zip") || lowerFilename.endsWith(".jar")) { return "zip"; } if (lowerFilename.endsWith(".rar")) { return "rar"; } if (lowerFilename.endsWith(".7z")) { return "7z"; } return null; } /** * Check if a buffer starts with the given magic byte sequence. * * @param buffer - Buffer to check * @param magic - Expected byte sequence * @returns true if the buffer starts with the magic bytes */ matchesMagic(buffer, magic) { for (let i = 0; i < magic.length; i++) { if (buffer[i] !== magic[i]) { return false; } } return true; } // =========================================================================== // ENTRY EXTRACTION // =========================================================================== /** * Extract entry metadata from the archive. * Delegates to format-specific extraction methods. * * @param buffer - Raw archive buffer * @param format - Detected archive format * @returns Extraction result with entries and security warnings, or error */ async extractEntries(buffer, format) { switch (format) { case "zip": return this.extractZipEntries(buffer); case "tar": return this.extractTarEntries(buffer); case "tar.gz": return this.extractTarGzEntries(buffer); case "tar.bz2": return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.UNSUPPORTED_TYPE, { format: "tar.bz2", reason: "TAR.BZ2 archives are not yet supported. Please convert to ZIP or TAR.GZ format.", supportedFormats: "ZIP, TAR, TAR.GZ, GZ", }), }; case "gz": return this.extractGzEntries(buffer); default: return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.UNSUPPORTED_TYPE, { format, reason: `${format} archives are not supported`, supportedFormats: "ZIP, TAR, TAR.GZ, GZ", }), }; } } // =========================================================================== // ZIP EXTRACTION // =========================================================================== /** * Extract entry metadata from a ZIP archive. * Validates each entry for path traversal, encryption, symlinks, and size limits. * * @param buffer - Raw ZIP buffer * @returns Extraction result with entries, security warnings, or error */ async extractZipEntries(buffer) { const entries = []; const securityWarnings = []; try { const AdmZip = (await import("adm-zip")).default; const zip = new AdmZip(buffer); const zipEntries = zip.getEntries(); // Check entry count limit if (zipEntries.length > ARCHIVE_SECURITY.MAX_ENTRIES) { return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, { reason: `Archive contains ${zipEntries.length} entries, exceeding the limit of ${ARCHIVE_SECURITY.MAX_ENTRIES}`, }), }; } let cumulativeUncompressedSize = 0; for (const entry of zipEntries) { const entryName = entry.entryName; // Security: path traversal check if (this.hasPathTraversal(entryName)) { securityWarnings.push(`Path traversal detected in entry: "${entryName}" - entry skipped`); continue; } // Security: path length check if (entryName.length > ARCHIVE_SECURITY.MAX_PATH_LENGTH) { securityWarnings.push(`Entry name exceeds maximum path length (${ARCHIVE_SECURITY.MAX_PATH_LENGTH}): "${entryName.substring(0, 50)}..." - entry skipped`); continue; } // Security: encrypted entry check if (entry.header.flags & 0x01) { if (!ARCHIVE_SECURITY.ALLOW_ENCRYPTED) { securityWarnings.push(`Encrypted entry detected: "${entryName}" - entry skipped`); continue; } } // Security: symlink check (ZIP external attributes) const externalAttr = entry.header.attr >>> 16; const isSymlink = (externalAttr & 0xa000) === 0xa000; if (isSymlink && !ARCHIVE_SECURITY.ALLOW_SYMLINKS) { securityWarnings.push(`Symbolic link detected: "${entryName}" - entry skipped`); continue; } const isDirectory = entry.isDirectory; const uncompressedSize = entry.header.size; const compressedSize = entry.header.compressedSize; // Security: single file size check if (!isDirectory && uncompressedSize > ARCHIVE_SECURITY.MAX_SINGLE_FILE_SIZE) { securityWarnings.push(`Entry "${entryName}" exceeds single file size limit (${this.formatSizeMB(uncompressedSize)} MB > ${this.formatSizeMB(ARCHIVE_SECURITY.MAX_SINGLE_FILE_SIZE)} MB) - entry listed but flagged`); } // Security: per-entry compression ratio check if (compressedSize > 0 && !isDirectory) { const ratio = uncompressedSize / compressedSize; if (ratio > ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO) { return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.ZIP_BOMB_DETECTED, { entryName, compressionRatio: ratio.toFixed(1), maxRatio: ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO, }), }; } } // Cumulative decompressed size check cumulativeUncompressedSize += uncompressedSize; if (cumulativeUncompressedSize > ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE) { return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, { reason: `Cumulative decompressed size exceeds limit of ${this.formatSizeMB(ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE)} MB`, }), }; } entries.push({ name: entryName, uncompressedSize, compressedSize, isDirectory, }); } return { success: true, entries, securityWarnings }; } catch (error) { return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.CORRUPTED_FILE, { reason: `Failed to read ZIP archive: ${error instanceof Error ? error.message : String(error)}`, }, error instanceof Error ? error : undefined), }; } } // =========================================================================== // TAR EXTRACTION // =========================================================================== /** * Extract entry metadata from a plain TAR archive. * * @param buffer - Raw TAR buffer * @returns Extraction result with entries and security warnings, or error */ async extractTarEntries(buffer) { try { const tarStream = await import("tar-stream"); return await this.parseTarStream(tarStream, buffer); } catch (error) { return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.CORRUPTED_FILE, { reason: `Failed to read TAR archive: ${error instanceof Error ? error.message : String(error)}`, }, error instanceof Error ? error : undefined), }; } } /** * Extract entry metadata from a GZIP-compressed TAR archive. * First decompresses with zlib, then parses as TAR. * * @param buffer - Raw TAR.GZ buffer * @returns Extraction result with entries and security warnings, or error */ async extractTarGzEntries(buffer) { try { const zlib = await import("zlib"); const { promisify } = await import("util"); const gunzip = promisify(zlib.gunzip); const decompressed = await gunzip(buffer); const tarBuffer = Buffer.from(decompressed); // Security: check decompressed size if (tarBuffer.length > ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE) { return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, { reason: `Decompressed TAR size (${this.formatSizeMB(tarBuffer.length)} MB) exceeds limit (${this.formatSizeMB(ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE)} MB)`, }), }; } // Security: check compression ratio if (buffer.length > 0) { const ratio = tarBuffer.length / buffer.length; if (ratio > ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO) { return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.ZIP_BOMB_DETECTED, { compressionRatio: ratio.toFixed(1), maxRatio: ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO, }), }; } } const tarStream = await import("tar-stream"); return await this.parseTarStream(tarStream, tarBuffer); } catch (error) { // Check if the error is one we already created (security validation) if (error && typeof error === "object" && "code" in error && typeof error.code === "string") { // Re-throw our structured errors return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.DECOMPRESSION_FAILED, { reason: `Failed to decompress TAR.GZ archive: ${error instanceof Error ? error.message : String(error)}`, }, error instanceof Error ? error : undefined), }; } return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.DECOMPRESSION_FAILED, { reason: `Failed to decompress TAR.GZ archive: ${error instanceof Error ? error.message : String(error)}`, }, error instanceof Error ? error : undefined), }; } } /** * Parse a TAR stream and extract entry metadata. * Shared between plain TAR and decompressed TAR.GZ processing. * * @param tarStream - The imported tar-stream module * @param buffer - Raw (decompressed) TAR buffer * @returns Extraction result with entries and security warnings, or error */ async parseTarStream(tarStream, buffer) { return new Promise((resolve) => { const entries = []; const securityWarnings = []; let entryCount = 0; let cumulativeSize = 0; let earlyError = null; const extract = tarStream.extract(); extract.on("entry", (header, stream, next) => { entryCount++; // Security: entry count limit if (entryCount > ARCHIVE_SECURITY.MAX_ENTRIES) { earlyError = this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, { reason: `Archive contains more than ${ARCHIVE_SECURITY.MAX_ENTRIES} entries`, }); stream.resume(); extract.destroy(); return; } const entryName = header.name || ""; const entrySize = header.size || 0; const entryType = header.type || "file"; // Security: path traversal if (this.hasPathTraversal(entryName)) { securityWarnings.push(`Path traversal detected in entry: "${entryName}" - entry skipped`); stream.resume(); next(); return; } // Security: path length if (entryName.length > ARCHIVE_SECURITY.MAX_PATH_LENGTH) { securityWarnings.push(`Entry name exceeds maximum path length (${ARCHIVE_SECURITY.MAX_PATH_LENGTH}): "${entryName.substring(0, 50)}..." - entry skipped`); stream.resume(); next(); return; } // Security: symlinks if ((entryType === "symlink" || entryType === "link") && !ARCHIVE_SECURITY.ALLOW_SYMLINKS) { securityWarnings.push(`Symbolic/hard link detected: "${entryName}" - entry skipped`); stream.resume(); next(); return; } const isDirectory = entryType === "directory"; // Security: single file size if (!isDirectory && entrySize > ARCHIVE_SECURITY.MAX_SINGLE_FILE_SIZE) { securityWarnings.push(`Entry "${entryName}" exceeds single file size limit (${this.formatSizeMB(entrySize)} MB > ${this.formatSizeMB(ARCHIVE_SECURITY.MAX_SINGLE_FILE_SIZE)} MB) - entry listed but flagged`); } // Security: cumulative size cumulativeSize += entrySize; if (cumulativeSize > ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE) { earlyError = this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, { reason: `Cumulative entry size exceeds limit of ${this.formatSizeMB(ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE)} MB`, }); stream.resume(); extract.destroy(); return; } entries.push({ name: entryName, uncompressedSize: entrySize, compressedSize: 0, // TAR doesn't compress individual entries isDirectory, }); // Consume the stream without buffering (we only need metadata) stream.resume(); next(); }); extract.on("finish", () => { if (earlyError) { resolve({ success: false, entries: [], securityWarnings: [], error: earlyError, }); } else { resolve({ success: true, entries, securityWarnings }); } }); extract.on("error", (err) => { if (earlyError) { resolve({ success: false, entries: [], securityWarnings: [], error: earlyError, }); } else { resolve({ success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.CORRUPTED_FILE, { reason: `Failed to parse TAR archive: ${err.message}`, }, err), }); } }); // Feed the buffer into the extract stream extract.end(buffer); }); } // =========================================================================== // GZIP EXTRACTION (plain, non-TAR) // =========================================================================== /** * Extract metadata from a plain GZIP file (single compressed file, not a TAR). * Since plain GZ wraps a single file, we create a single entry using the * original filename minus the .gz extension. * * @param buffer - Raw GZIP buffer * @returns Extraction result with a single entry and security warnings, or error */ async extractGzEntries(buffer) { try { const zlib = await import("zlib"); const { promisify } = await import("util"); const gunzip = promisify(zlib.gunzip); const decompressed = await gunzip(buffer); // Security: check decompressed size if (decompressed.length > ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE) { return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.SECURITY_VALIDATION_FAILED, { reason: `Decompressed size (${this.formatSizeMB(decompressed.length)} MB) exceeds limit (${this.formatSizeMB(ARCHIVE_SECURITY.MAX_DECOMPRESSED_SIZE)} MB)`, }), }; } // Security: compression ratio if (buffer.length > 0) { const ratio = decompressed.length / buffer.length; if (ratio > ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO) { return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.ZIP_BOMB_DETECTED, { compressionRatio: ratio.toFixed(1), maxRatio: ARCHIVE_SECURITY.MAX_COMPRESSION_RATIO, }), }; } } // Check if the decompressed content is actually a TAR if (this.looksLikeTar(decompressed)) { // It's actually a tar.gz; re-route through TAR extraction const tarStream = await import("tar-stream"); return await this.parseTarStream(tarStream, Buffer.from(decompressed)); } // Plain GZ - single entry // Derive the inner filename by removing the .gz extension const innerFilename = "decompressed-content"; const securityWarnings = []; const entries = [ { name: innerFilename, uncompressedSize: decompressed.length, compressedSize: buffer.length, isDirectory: false, }, ]; return { success: true, entries, securityWarnings }; } catch (error) { return { success: false, entries: [], securityWarnings: [], error: this.createError(FileErrorCode.DECOMPRESSION_FAILED, { reason: `Failed to decompress GZIP file: ${error instanceof Error ? error.message : String(error)}`, }, error instanceof Error ? error : undefined), }; } } /** * Heuristic check to determine if a buffer looks like a TAR archive. * TAR archives have a "ustar" magic string at byte offset 257. * * @param buffer - Decompressed buffer to check * @returns true if the buffer appears to be a TAR archive */ looksLikeTar(buffer) { if (buffer.length < 263) { return false; } // "ustar" at offset 257 const magic = Buffer.from(buffer.slice(257, 263)).toString("ascii"); return magic.startsWith("ustar"); } // =========================================================================== // SECURITY VALIDATION // =========================================================================== /** * Check if an entry name contains path traversal sequences. * Detects `../`, absolute paths, and other traversal vectors. * * @param entryName - Archive entry name/path to validate * @returns true if path traversal is detected */ hasPathTraversal(entryName) { // Normalize separators const normalized = entryName.replace(/\\/g, "/"); // Check for parent directory traversal if (normalized.includes("../") || normalized.includes("/..")) { return true; } // Check for absolute paths if (normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized)) { return true; } // Check resolved path doesn't escape root const resolved = path.posix.normalize(normalized); if (resolved.startsWith("../") || resolved === "..") { return true; } return false; } // =========================================================================== // CONTENT EXTRACTION (Phase 2 sub-processing) // =========================================================================== /** * Extract text content from eligible ZIP entries for LLM consumption. * * Selects small, text-based files from the archive and extracts their * content. Files are sorted by relevance (config files, source code, docs). * Binary files, nested archives, and files exceeding size limits are skipped. * * @param buffer - Raw ZIP archive buffer * @param entries - Previously extracted entry metadata * @returns Map of entry name to extracted text content */ async extractEntryContents(buffer, entries) { const contents = new Map(); try { const AdmZip = (await import("adm-zip")).default; const zip = new AdmZip(buffer); // Filter to extractable text-based entries within size limits const candidates = entries .filter((e) => { if (e.isDirectory) { return false; } if (e.uncompressedSize > ARCHIVE_CONFIG.MAX_EXTRACT_ENTRY_SIZE) { return false; } if (e.uncompressedSize === 0) { return false; } const ext = path.extname(e.name).toLowerCase(); // Check by extension if (ARCHIVE_CONFIG.EXTRACTABLE_EXTENSIONS.has(ext)) { return true; } // Check for common extensionless config files const basename = path.basename(e.name).toLowerCase(); if (basename === "readme" || basename === "license" || basename === "makefile" || basename === "dockerfile") { return true; } return false; }) // Sort: smaller files first (more likely to fit), then by name .sort((a, b) => a.uncompressedSize - b.uncompressedSize); let totalExtracted = 0; let extractCount = 0; for (const entry of candidates) { if (extractCount >= ARCHIVE_CONFIG.MAX_EXTRACT_ENTRIES) { break; } if (totalExtracted + entry.uncompressedSize > ARCHIVE_CONFIG.MAX_TOTAL_EXTRACT_SIZE) { break; } try { const zipEntry = zip.getEntry(entry.name); if (!zipEntry) { continue; } const data = zipEntry.getData(); if (!data || data.length === 0) { continue; } // Simple binary detection: check for null bytes in first 512 bytes const sample = data.slice(0, Math.min(512, data.length)); if (sample.includes(0)) { continue; } const text = data.toString("utf-8"); // Sanity check: skip if too many replacement characters (likely binary) const replacementCount = (text.match(/\ufffd/g) || []).length; if (replacementCount > text.length * 0.05) { continue; } contents.set(entry.name, text); totalExtracted += data.length; extractCount++; } catch { // Skip entries that fail to extract (binary, corrupt, etc.) } } } catch { // If ZIP re-parsing fails, return empty — listing is still available } return contents; } // =========================================================================== // TEXT CONTENT BUILDING // =========================================================================== /** * Build a structured text description of the archive for LLM consumption. * Includes archive metadata, file listing with sizes, and security warnings. * * @param filename - Original archive filename * @param metadata - Aggregate archive metadata * @param entries - Individual entry metadata * @param securityWarnings - Security warnings encountered during processing * @param extractedContents - Map of entry name to extracted text content (Phase 2) * @returns Formatted text content string */ buildTextContent(filename, metadata, entries, securityWarnings, extractedContents) { const lines = []; // Header lines.push(`## Archive: ${filename}`); lines.push(""); // Metadata lines.push("### Metadata"); lines.push(`- **Format:** ${metadata.format.toUpperCase()}`); lines.push(`- **Total entries:** ${metadata.totalEntries}`); lines.push(`- **Total uncompressed size:** ${this.formatHumanReadableSize(metadata.totalUncompressedSize)}`); if (metadata.totalCompressedSize > 0) { lines.push(`- **Total compressed size:** ${this.formatHumanReadableSize(metadata.totalCompressedSize)}`); } lines.push(""); // Security warnings if (securityWarnings.length > 0) { lines.push("### Security Warnings"); for (const warning of securityWarnings) { lines.push(`- ${warning}`); } lines.push(""); } // File listing lines.push("### Contents"); lines.push(""); // Separate directories and files const directories = entries.filter((e) => e.isDirectory); const files = entries.filter((e) => !e.isDirectory); if (directories.length > 0) { lines.push(`**Directories (${directories.length}):**`); for (const dir of directories) { lines.push(` ${dir.name}`); } lines.push(""); } if (files.length > 0) { lines.push(`**Files (${files.length}):**`); // Sort files by path for readability const sortedFiles = [...files].sort((a, b) => a.name.localeCompare(b.name)); for (const file of sortedFiles) { const sizeStr = this.formatHumanReadableSize(file.uncompressedSize); lines.push(` ${file.name} (${sizeStr})`); } lines.push(""); } if (entries.length === 0) { lines.push("*Archive is empty.*"); lines.push(""); } // Extracted file contents (Phase 2 sub-processing) if (extractedContents && extractedContents.size > 0) { lines.push("### Extracted File Contents"); lines.push(""); extractedContents.forEach((content, entryName) => { const ext = path.extname(entryName).replace(".", ""); const langHint = ext || ""; lines.push(`#### ${entryName}`); lines.push(`\`\`\`${langHint}`); // Truncate very long file contents to avoid excessive token usage if (content.length > 10000) { lines.push(content.slice(0, 8000)); lines.push(`\n... [truncated ${content.length - 8000} characters] ...`); lines.push(content.slice(-1000)); } else { lines.push(content); } lines.push("```"); lines.push(""); }); } return lines.join("\n"); } /** * Format a byte count as a human-readable size string. * * @param bytes - Size in bytes * @returns Formatted string (e.g., "1.5 MB", "256 KB", "128 B") */ formatHumanReadableSize(bytes) { if (bytes === 0) { return "0 B"; } const units = ["B", "KB", "MB", "GB"]; const k = 1024; const i = Math.floor(Math.log(bytes) / Math.log(k)); const idx = Math.min(i, units.length - 1); return `${parseFloat((bytes / k ** idx).toFixed(2))} ${units[idx]}`; } // ===========================================================================