@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.
287 lines (286 loc) • 11 kB
TypeScript
/**
* Archive Processor
*
* Handles downloading, validating, and processing archive files (ZIP, TAR, TAR.GZ, GZ).
* Extracts file listings with metadata for AI consumption without recursively
* processing individual entries through other processors (Phase 1).
*
* Key features:
* - ZIP support via adm-zip (dynamic import)
* - TAR / TAR.GZ support via tar-stream (dynamic import)
* - Plain GZ support via Node zlib
* - Comprehensive security validation (path traversal, zip bombs, symlinks, encryption)
* - In-memory extraction with configurable size limits
* - Structured text output for LLM consumption
*
* @module processors/archive/ArchiveProcessor
*
* @example
* ```typescript
* import { archiveProcessor, processArchive, isArchiveFile } from "./ArchiveProcessor.js";
*
* // Check if a file is an archive
* if (isArchiveFile(fileInfo.mimetype, fileInfo.name)) {
* const result = await processArchive(fileInfo, {
* authHeaders: { Authorization: "Bearer token" },
* });
*
* if (result.success) {
* console.log(`Format: ${result.data.archiveMetadata.format}`);
* console.log(`Entries: ${result.data.archiveMetadata.totalEntries}`);
* for (const entry of result.data.entries) {
* console.log(` ${entry.name} (${entry.uncompressedSize} bytes)`);
* }
* }
* }
* ```
*/
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
import type { FileInfo, ProcessedArchive, ProcessorFileProcessingResult, ProcessOptions } from "../../types/index.js";
/**
* Archive Processor - handles ZIP, TAR, TAR.GZ, and plain GZ files.
*
* Extends `BaseFileProcessor<ProcessedArchive>` and overrides the base
* `processFile()` to implement a custom pipeline:
* 1. Validate file type and size
* 2. Obtain the archive buffer (from provided buffer or URL download)
* 3. Detect the archive format via magic bytes and file extension
* 4. Run security validation (path traversal, zip bombs, encryption, symlinks)
* 5. Extract entry metadata (no recursive file processing in Phase 1)
* 6. Build LLM-friendly text content with file listing
*
* RAR and 7z formats are detected but not yet supported for extraction.
*
* NOTE(review): steps 5-6 describe a listing-only "Phase 1" pipeline, but this
* class also declares `extractEntryContents` and `extractEntry`, and the
* `buildTextContent` docs mention extracted contents as "Phase 2". The text
* content may therefore include file contents for ZIP archives - confirm
* against the implementation and update this list if so.
*
* This is a declaration file: private members are emitted by name only, without
* parameter or return types. The `@param` / `@returns` tags on private members
* describe the implementation's signature and cannot be checked from here.
*
* @example
* ```typescript
* const processor = new ArchiveProcessor();
*
* const result = await processor.processFile(fileInfo, {
*   authHeaders: { Authorization: "Bearer token" },
* });
*
* if (result.success) {
*   console.log(`Format: ${result.data.archiveMetadata.format}`);
*   console.log(`Entries: ${result.data.entries.length}`);
*   console.log(result.data.textContent);
* }
* ```
*/
export declare class ArchiveProcessor extends BaseFileProcessor<ProcessedArchive> {
/**
* Create a new archive processor.
*
* Takes no arguments. Any configuration handed to `BaseFileProcessor`
* is set inside the implementation and is not visible in this declaration.
*/
constructor();
/**
* Build a stub processed result.
* The actual work is done in the `processFile()` override; this method
* satisfies the abstract contract from `BaseFileProcessor`.
*
* @param buffer - Raw archive buffer
* @param fileInfo - Original file information
* @returns Empty ProcessedArchive scaffold
*/
protected buildProcessedResult(buffer: Buffer, fileInfo: FileInfo): ProcessedArchive;
/**
* Process an archive file through the full extraction pipeline.
*
* This is the public entry point of the class; see the class-level
* description for the pipeline stages.
*
* @param fileInfo - File information (can include URL or buffer)
* @param options - Optional processing options (auth headers, timeout, etc.)
* @returns Promise of a processing result carrying archive metadata and the
* entry listing on success, or error details on failure
*/
processFile(fileInfo: FileInfo, options?: ProcessOptions): Promise<ProcessorFileProcessingResult<ProcessedArchive>>;
/**
* Detect the archive format using magic bytes and file extension.
* Magic bytes take precedence over extension when available.
*
* @param buffer - Raw archive buffer
* @param filename - Original filename for extension-based fallback
* @returns Detected archive format, or null if unrecognized
*/
private detectArchiveFormat;
/**
* Detect archive format from magic bytes at the start of the buffer.
*
* @param buffer - Raw archive buffer
* @returns Detected format, or null if magic bytes don't match any known format
*/
private detectFormatFromMagicBytes;
/**
* Detect archive format from file extension.
*
* @param filename - Filename to extract extension from
* @returns Detected format, or null if extension is unrecognized
*/
private detectFormatFromExtension;
/**
* Check if a buffer starts with the given magic byte sequence.
*
* @param buffer - Buffer to check
* @param magic - Expected byte sequence
* @returns true if the buffer starts with the magic bytes
*/
private matchesMagic;
/**
* Extract entry metadata from the archive.
* Delegates to format-specific extraction methods
* (`extractZipEntries`, `extractTarEntries`, `extractTarGzEntries`,
* `extractGzEntries` are the format-specific members declared on this class).
*
* @param buffer - Raw archive buffer
* @param format - Detected archive format
* @returns Extraction result with entries and security warnings, or error
*/
private extractEntries;
/**
* Extract entry metadata from a ZIP archive.
* Validates each entry for path traversal, encryption, symlinks, and size limits.
*
* @param buffer - Raw ZIP buffer
* @returns Extraction result with entries, security warnings, or error
*/
private extractZipEntries;
/**
* Extract entry metadata from a plain TAR archive.
*
* @param buffer - Raw TAR buffer
* @returns Extraction result with entries and security warnings, or error
*/
private extractTarEntries;
/**
* Extract entry metadata from a GZIP-compressed TAR archive.
* First decompresses with zlib, then parses as TAR.
*
* @param buffer - Raw TAR.GZ buffer
* @returns Extraction result with entries and security warnings, or error
*/
private extractTarGzEntries;
/**
* Parse a TAR stream and extract entry metadata.
* Shared between plain TAR and decompressed TAR.GZ processing.
*
* @param tarStream - The imported tar-stream module
* @param buffer - Raw (decompressed) TAR buffer
* @returns Extraction result with entries and security warnings, or error
*/
private parseTarStream;
/**
* Extract metadata from a plain GZIP file (single compressed file, not a TAR).
* Since plain GZ wraps a single file, we create a single entry using the
* original filename minus the .gz extension.
*
* NOTE(review): the description relies on the original filename, yet only
* `buffer` is documented as a parameter - confirm how the filename reaches
* this method and document it.
*
* @param buffer - Raw GZIP buffer
* @returns Extraction result with a single entry and security warnings, or error
*/
private extractGzEntries;
/**
* Heuristic check to determine if a buffer looks like a TAR archive.
* TAR archives have a "ustar" magic string at byte offset 257.
*
* @param buffer - Decompressed buffer to check
* @returns true if the buffer appears to be a TAR archive
*/
private looksLikeTar;
/**
* Check if an entry name contains path traversal sequences.
* Detects `../`, absolute paths, and other traversal vectors.
*
* @param entryName - Archive entry name/path to validate
* @returns true if path traversal is detected
*/
private hasPathTraversal;
/**
* Extract text content from eligible ZIP entries for LLM consumption.
*
* Selects small, text-based files from the archive and extracts their
* content. Files are sorted by relevance (config files, source code, docs).
* Binary files, nested archives, and files exceeding size limits are skipped.
*
* Applies to ZIP archives only, per the description above.
*
* @param buffer - Raw ZIP archive buffer
* @param entries - Previously extracted entry metadata
* @returns Map of entry name to extracted text content
*/
private extractEntryContents;
/**
* Build a structured text description of the archive for LLM consumption.
* Includes archive metadata, file listing with sizes, and security warnings.
* Presumably also embeds the extracted entry contents when `extractedContents`
* is supplied - verify against the implementation.
*
* @param filename - Original archive filename
* @param metadata - Aggregate archive metadata
* @param entries - Individual entry metadata
* @param securityWarnings - Security warnings encountered during processing
* @param extractedContents - Map of entry name to extracted text content (Phase 2)
* @returns Formatted text content string
*/
private buildTextContent;
/**
* Format a byte count as a human-readable size string.
*
* @param bytes - Size in bytes
* @returns Formatted string (e.g., "1.5 MB", "256 KB", "128 B")
*/
private formatHumanReadableSize;
/**
* Extract a specific file from a ZIP archive and return its text content.
*
* Called by the `extract_file_content` tool for targeted access to files
* inside archives. Only supports ZIP archives (the most common format).
* Applies security checks (path traversal, size limits).
*
* NOTE(review): the return type is `Promise<string>` for both outcomes, so a
* caller cannot tell content from an error message by type alone - confirm
* whether failures resolve with a message string or reject.
*
* @param buffer - Archive file buffer
* @param entryPath - Path of the entry within the archive (e.g., "src/index.ts")
* @returns Text content of the extracted file, or error message
*/
extractEntry(buffer: Buffer, entryPath: string): Promise<string>;
}
/**
* Singleton Archive processor instance.
* Use this for standard archive processing operations.
*
* The `processArchive()` convenience function in this module is documented as
* using this same instance, so the two call styles are interchangeable.
*
* @example
* ```typescript
* import { archiveProcessor } from "./ArchiveProcessor.js";
*
* const result = await archiveProcessor.processFile(fileInfo);
* ```
*/
export declare const archiveProcessor: ArchiveProcessor;
/**
* Check if a file is an archive file.
* Matches by MIME type or file extension; either one is sufficient, so an
* empty MIME type can still match on the filename (see the second example).
*
* The exact set of recognized MIME types and extensions is defined in the
* implementation and is not visible in this declaration.
* NOTE(review): confirm whether RAR and 7z count as "recognized" here, since
* the processor detects them but does not support extracting them.
*
* @param mimetype - MIME type of the file (may be an empty string)
* @param filename - Filename (for extension-based detection)
* @returns true if the file is a recognized archive format
*
* @example
* ```typescript
* if (isArchiveFile("application/zip", "backup.zip")) {
*   // Process as archive
* }
*
* if (isArchiveFile("", "data.tar.gz")) {
*   // Also matches by extension
* }
* ```
*/
export declare function isArchiveFile(mimetype: string, filename: string): boolean;
/**
* Process a single archive file.
* Convenience function that uses the singleton processor.
*
* As the example shows, callers branch on `result.success` and read failure
* details from `result.error?.userMessage`.
* NOTE(review): this suggests failures are reported through the result object
* rather than by rejection - confirm whether the promise can also reject.
*
* @param fileInfo - File information (can include URL or buffer)
* @param options - Optional processing options (auth headers, timeout, etc.)
* @returns Promise of a processing result with archive metadata and entry
* listing on success, or error details on failure
*
* @example
* ```typescript
* import { processArchive } from "./ArchiveProcessor.js";
*
* const result = await processArchive(fileInfo, {
*   authHeaders: { Authorization: "Bearer token" },
* });
*
* if (result.success) {
*   const { archiveMetadata, entries, textContent } = result.data;
*   console.log(`Found ${entries.length} entries in ${archiveMetadata.format} archive`);
*   console.log(textContent);
* } else {
*   console.error(`Processing failed: ${result.error?.userMessage}`);
* }
* ```
*/
export declare function processArchive(fileInfo: FileInfo, options?: ProcessOptions): Promise<ProcessorFileProcessingResult<ProcessedArchive>>;