UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio

287 lines (286 loc) 11 kB
/**
 * Archive Processor
 *
 * Handles downloading, validating, and processing archive files
 * (ZIP, TAR, TAR.GZ, GZ). Extracts file listings with metadata for AI
 * consumption without recursively processing individual entries through
 * other processors (Phase 1).
 *
 * NOTE(review): the class below also declares `extractEntryContents` and
 * `extractEntry`, both documented as reading the text content of ZIP entries,
 * and `buildTextContent` documents an `extractedContents` argument tagged
 * "Phase 2". The "Phase 1" wording here may be out of date - confirm against
 * the implementation.
 *
 * Key features:
 * - ZIP support via adm-zip (dynamic import)
 * - TAR / TAR.GZ support via tar-stream (dynamic import)
 * - Plain GZ support via Node zlib
 * - Comprehensive security validation (path traversal, zip bombs, symlinks, encryption)
 * - In-memory extraction with configurable size limits
 * - Structured text output for LLM consumption
 *
 * This is a declaration file: it describes the public type surface only.
 * Private members appear without signatures, so their `@param` / `@returns`
 * tags describe the implementation and cannot be checked from here.
 *
 * @module processors/archive/ArchiveProcessor
 *
 * @example
 * ```typescript
 * import { archiveProcessor, processArchive, isArchiveFile } from "./ArchiveProcessor.js";
 *
 * // Check if a file is an archive
 * if (isArchiveFile(fileInfo.mimetype, fileInfo.name)) {
 *   const result = await processArchive(fileInfo, {
 *     authHeaders: { Authorization: "Bearer token" },
 *   });
 *
 *   if (result.success) {
 *     console.log(`Format: ${result.data.archiveMetadata.format}`);
 *     console.log(`Entries: ${result.data.archiveMetadata.totalEntries}`);
 *     for (const entry of result.data.entries) {
 *       console.log(`  ${entry.name} (${entry.uncompressedSize} bytes)`);
 *     }
 *   }
 * }
 * ```
 */
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
import type { FileInfo, ProcessedArchive, ProcessorFileProcessingResult, ProcessOptions } from "../../types/index.js";
/**
 * Archive Processor - handles ZIP, TAR, TAR.GZ, and plain GZ files.
 *
 * Overrides the base `processFile()` to implement a custom pipeline:
 * 1. Validate file type and size
 * 2. Obtain the archive buffer (from provided buffer or URL download)
 * 3. Detect the archive format via magic bytes and file extension
 * 4. Run security validation (path traversal, zip bombs, encryption, symlinks)
 * 5. Extract entry metadata (no recursive file processing in Phase 1)
 * 6. Build LLM-friendly text content with file listing
 *
 * RAR and 7z formats are detected but not yet supported for extraction.
 * NOTE(review): how an unsupported format is reported to the caller (error
 * result vs. empty listing) is not visible in this declaration - confirm.
 *
 * @example
 * ```typescript
 * const processor = new ArchiveProcessor();
 *
 * const result = await processor.processFile(fileInfo, {
 *   authHeaders: { Authorization: "Bearer token" },
 * });
 *
 * if (result.success) {
 *   console.log(`Format: ${result.data.archiveMetadata.format}`);
 *   console.log(`Entries: ${result.data.entries.length}`);
 *   console.log(result.data.textContent);
 * }
 * ```
 */
export declare class ArchiveProcessor extends BaseFileProcessor<ProcessedArchive> {
    /** Creates a processor instance; takes no arguments. */
    constructor();
    /**
     * Build a stub processed result.
     * The actual work is done in the `processFile()` override; this method
     * satisfies the abstract contract from `BaseFileProcessor`.
     *
     * @param buffer - Raw archive buffer
     * @param fileInfo - Original file information
     * @returns Empty ProcessedArchive scaffold
     */
    protected buildProcessedResult(buffer: Buffer, fileInfo: FileInfo): ProcessedArchive;
    /**
     * Process an archive file through the full extraction pipeline
     * (the six steps listed on the class).
     *
     * @param fileInfo - File information (can include URL or buffer)
     * @param options - Optional processing options (auth headers, timeout, etc.)
     * @returns Processing result with archive metadata and entry listing, or error
     */
    processFile(fileInfo: FileInfo, options?: ProcessOptions): Promise<ProcessorFileProcessingResult<ProcessedArchive>>;
    /**
     * Detect the archive format using magic bytes and file extension.
     * Magic bytes take precedence over extension when available.
     *
     * @param buffer - Raw archive buffer
     * @param filename - Original filename for extension-based fallback
     * @returns Detected archive format, or null if unrecognized
     */
    private detectArchiveFormat;
    /**
     * Detect archive format from magic bytes at the start of the buffer.
     *
     * @param buffer - Raw archive buffer
     * @returns Detected format, or null if magic bytes don't match any known format
     */
    private detectFormatFromMagicBytes;
    /**
     * Detect archive format from file extension.
     *
     * @param filename - Filename to extract extension from
     * @returns Detected format, or null if extension is unrecognized
     */
    private detectFormatFromExtension;
    /**
     * Check if a buffer starts with the given magic byte sequence.
     *
     * @param buffer - Buffer to check
     * @param magic - Expected byte sequence
     * @returns true if the buffer starts with the magic bytes
     */
    private matchesMagic;
    /**
     * Extract entry metadata from the archive.
     * Delegates to format-specific extraction methods.
     *
     * @param buffer - Raw archive buffer
     * @param format - Detected archive format
     * @returns Extraction result with entries and security warnings, or error
     */
    private extractEntries;
    /**
     * Extract entry metadata from a ZIP archive.
     * Validates each entry for path traversal, encryption, symlinks, and size limits.
     *
     * @param buffer - Raw ZIP buffer
     * @returns Extraction result with entries, security warnings, or error
     */
    private extractZipEntries;
    /**
     * Extract entry metadata from a plain TAR archive.
     *
     * @param buffer - Raw TAR buffer
     * @returns Extraction result with entries and security warnings, or error
     */
    private extractTarEntries;
    /**
     * Extract entry metadata from a GZIP-compressed TAR archive.
     * First decompresses with zlib, then parses as TAR.
     *
     * @param buffer - Raw TAR.GZ buffer
     * @returns Extraction result with entries and security warnings, or error
     */
    private extractTarGzEntries;
    /**
     * Parse a TAR stream and extract entry metadata.
     * Shared between plain TAR and decompressed TAR.GZ processing.
     *
     * @param tarStream - The imported tar-stream module
     * @param buffer - Raw (decompressed) TAR buffer
     * @returns Extraction result with entries and security warnings, or error
     */
    private parseTarStream;
    /**
     * Extract metadata from a plain GZIP file (single compressed file, not a TAR).
     * Since plain GZ wraps a single file, we create a single entry using the
     * original filename minus the .gz extension.
     *
     * NOTE(review): only `buffer` is documented as a parameter, yet the entry
     * name is derived from the original filename - confirm whether the filename
     * is passed as an additional argument or read from elsewhere.
     *
     * @param buffer - Raw GZIP buffer
     * @returns Extraction result with a single entry and security warnings, or error
     */
    private extractGzEntries;
    /**
     * Heuristic check to determine if a buffer looks like a TAR archive.
     * TAR archives have a "ustar" magic string at byte offset 257.
     *
     * @param buffer - Decompressed buffer to check
     * @returns true if the buffer appears to be a TAR archive
     */
    private looksLikeTar;
    /**
     * Check if an entry name contains path traversal sequences.
     * Detects `../`, absolute paths, and other traversal vectors.
     *
     * @param entryName - Archive entry name/path to validate
     * @returns true if path traversal is detected
     */
    private hasPathTraversal;
    /**
     * Extract text content from eligible ZIP entries for LLM consumption.
     *
     * Selects small, text-based files from the archive and extracts their
     * content. Files are sorted by relevance (config files, source code, docs).
     * Binary files, nested archives, and files exceeding size limits are skipped.
     *
     * @param buffer - Raw ZIP archive buffer
     * @param entries - Previously extracted entry metadata
     * @returns Map of entry name to extracted text content
     */
    private extractEntryContents;
    /**
     * Build a structured text description of the archive for LLM consumption.
     * Includes archive metadata, file listing with sizes, and security warnings.
     *
     * @param filename - Original archive filename
     * @param metadata - Aggregate archive metadata
     * @param entries - Individual entry metadata
     * @param securityWarnings - Security warnings encountered during processing
     * @param extractedContents - Map of entry name to extracted text content (Phase 2)
     * @returns Formatted text content string
     */
    private buildTextContent;
    /**
     * Format a byte count as a human-readable size string.
     *
     * @param bytes - Size in bytes
     * @returns Formatted string (e.g., "1.5 MB", "256 KB", "128 B")
     */
    private formatHumanReadableSize;
    /**
     * Extract a specific file from a ZIP archive and return its text content.
     *
     * Called by the `extract_file_content` tool for targeted access to files
     * inside archives. Only supports ZIP archives (the most common format).
     * Applies security checks (path traversal, size limits).
     *
     * NOTE(review): the return is documented as "text content ... or error
     * message", i.e. failures appear to come back through the same string
     * channel as content rather than as a rejection - confirm, since callers
     * cannot tell the two apart by type.
     *
     * @param buffer - Archive file buffer
     * @param entryPath - Path of the entry within the archive (e.g., "src/index.ts")
     * @returns Text content of the extracted file, or error message
     */
    extractEntry(buffer: Buffer, entryPath: string): Promise<string>;
}
/**
 * Singleton Archive processor instance.
 * Use this for standard archive processing operations.
 *
 * @example
 * ```typescript
 * import { archiveProcessor } from "./ArchiveProcessor.js";
 *
 * const result = await archiveProcessor.processFile(fileInfo);
 * ```
 */
export declare const archiveProcessor: ArchiveProcessor;
/**
 * Check if a file is an archive file.
 * Matches by MIME type or file extension.
 *
 * @param mimetype - MIME type of the file
 * @param filename - Filename (for extension-based detection)
 * @returns true if the file is a recognized archive format
 *
 * @example
 * ```typescript
 * if (isArchiveFile("application/zip", "backup.zip")) {
 *   // Process as archive
 * }
 *
 * if (isArchiveFile("", "data.tar.gz")) {
 *   // Also matches by extension
 * }
 * ```
 */
export declare function isArchiveFile(mimetype: string, filename: string): boolean;
/**
 * Process a single archive file.
 * Convenience function that uses the singleton processor.
 *
 * @param fileInfo - File information (can include URL or buffer)
 * @param options - Optional processing options (auth headers, timeout, etc.)
 * @returns Processing result with archive metadata and entry listing, or error
 *
 * @example
 * ```typescript
 * import { processArchive } from "./ArchiveProcessor.js";
 *
 * const result = await processArchive(fileInfo, {
 *   authHeaders: { Authorization: "Bearer token" },
 * });
 *
 * if (result.success) {
 *   const { archiveMetadata, entries, textContent } = result.data;
 *   console.log(`Found ${entries.length} entries in ${archiveMetadata.format} archive`);
 *   console.log(textContent);
 * } else {
 *   console.error(`Processing failed: ${result.error?.userMessage}`);
 * }
 * ```
 */
export declare function processArchive(fileInfo: FileInfo, options?: ProcessOptions): Promise<ProcessorFileProcessingResult<ProcessedArchive>>;