zip-iterator
Version:
Extract the contents of zip archives through an iterator API that accepts streams or file paths. Use the stream interface and pipe transforms to add decompression algorithms.
588 lines (587 loc) • 24.9 kB
JavaScript
/**
* ZipExtract - Forward-Only ZIP Parser
*
* Parses ZIP files in a single forward pass using Local File Headers.
* Does not require seeking or the Central Directory.
*
* Uses native zlib on Node 0.11.12+, falls back to pako for older versions
*
* State Machine:
* ```
* SIGNATURE ──┬── LOCAL_HEADER ── FILE_DATA ──┬── DATA_DESCRIPTOR ──┐
* │ │ │
* └───────────────────────────────┴─────────────────────┘
* │
* └── CENTRAL_DIR/END ── FINISHED
* ```
*
* State Transitions:
* - SIGNATURE: Reads 4-byte signature to determine next state
* - Local File Header (0x04034b50) → LOCAL_HEADER
* - Central Directory (0x02014b50) → FINISHED
* - End of Central Dir (0x06054b50) → FINISHED
*
* - LOCAL_HEADER: Parses header, creates entry stream → FILE_DATA
*
* - FILE_DATA: Streams or buffers file content
* - Known size: reads bytesRemaining bytes → SIGNATURE
* - Data descriptor: scans for boundary → DATA_DESCRIPTOR
*
* - DATA_DESCRIPTOR: Parses descriptor, verifies CRC → SIGNATURE
*
* Events:
* 'entry' (header: LocalFileHeader, stream: Readable, next: () => void)
* 'error' (err: Error)
* 'finish' ()
*/ import { EventEmitter } from 'events';
import { BufferList, crc32, inflateRaw } from 'extract-base-iterator';
import { DeflateStreamHandler } from './compression/DeflateStream.js';
import { StoreHandler } from './compression/StoreStream.js';
import * as C from './constants.js';
import { findDeflateBoundary, findStoreDataEnd, getSafetyBufferSize } from './DataDescriptorParser.js';
import { createEntryStream, emitErrorToStream, emitStreamError } from './EntryEmitter.js';
import { parseDataDescriptor, parseLocalFileHeader } from './headers.js';
/**
 * Parser states used by the ZipExtract state machine.
 *
 * Frozen so that an accidental write (e.g. `State.FINISHED = ...`) cannot
 * silently corrupt the state comparisons made throughout the parser.
 */
const State = Object.freeze({
  // Waiting for a 4-byte signature that decides what comes next
  SIGNATURE: 0,
  // A local file header signature was seen; header is being parsed
  LOCAL_HEADER: 1,
  // Entry content is being streamed or buffered
  FILE_DATA: 2,
  // Entry content is done; the trailing data descriptor is being parsed
  DATA_DESCRIPTOR: 3,
  // Terminal state: the parser finished or emitted an error
  FINISHED: 4,
});
/**
 * Forward-only ZIP parser driven by `write()` / `end()` calls.
 *
 * The parser walks a small state machine (see `State`) over the bytes it is
 * given and emits:
 *   - 'entry'  (header, stream, next) for every local file header; parsing of
 *              the NEXT entry is held back until `next()` is called
 *   - 'error'  (err) once, after which the parser is in the FINISHED state
 *   - 'finish' () when the central directory / end record is reached or the
 *              input ends cleanly between entries
 */
class ZipExtract extends EventEmitter {
  /**
   * @param {object} [options]
   * @param {boolean} [options.verifyCrc] - CRC checks are skipped only when this is exactly `false`
   * @param {number} [options.maxDataDescriptorBuffer] - cap (bytes) on buffered DEFLATE data for
   *   data-descriptor entries; defaults to 100 MiB, a value <= 0 disables the cap
   */
  constructor(options = {}) {
    super();
    this.options = options;
    // Unparsed input
    this.buffer = new BufferList();
    this.state = State.SIGNATURE;
    // Entry currently being processed (null between entries)
    this.currentHeader = null;
    this.currentStream = null;
    // Compressed bytes still expected for a known-size entry (-1 when unknown)
    this.bytesRemaining = 0;
    // True while waiting for the consumer to call next() on the emitted entry
    this.locked = false;
    // True once end() has been called
    this.ended = false;
    // Streaming handler for known-size entries
    this.compressionHandler = null;
    // Accumulator for DEFLATE entries whose size is only in the data descriptor
    this.compressedChunks = null;
    this.compressedChunksSize = 0;
    this.runningCrc = 0;
    this.expectedCrc = 0;
  }

  /**
   * Feed a chunk of archive bytes to the parser.
   *
   * @param {Buffer} chunk - next bytes of the archive
   * @param {Function} [callback] - invoked synchronously once the chunk was handled
   * @returns {boolean} `false` after end() or while an entry is locked, otherwise `true`
   */
  write(chunk, callback) {
    if (this.ended) {
      if (callback) callback();
      return false;
    }
    if (this.state === State.FINISHED) {
      // The parser never reads the buffer again once FINISHED (central
      // directory reached or an error was emitted). Dropping the chunk keeps
      // a producer that continues to pipe from growing the buffer forever.
      if (callback) callback();
      return !this.locked;
    }
    this.buffer.append(chunk);
    this.process();
    if (callback) callback();
    return !this.locked;
  }

  /**
   * Signal end of input.
   *
   * @param {Function} [callback] - invoked synchronously before returning
   */
  end(callback) {
    // Guard against re-entrant calls (can happen when an error handler triggers cleanup)
    if (this.ended) {
      if (callback) callback();
      return;
    }
    this.ended = true;

    // An entry whose stream is still open in FILE_DATA means the archive was
    // cut short, including the case where the consumer called next() early.
    // Exception: a handler still finishing asynchronously is not truncation.
    const waitingForAsync = this.hasPendingCompression();
    if (this.currentStream && this.state === State.FILE_DATA && !waitingForAsync) {
      const err = C.createZipError(this.bytesRemaining > 0 ? `Truncated archive: expected ${this.bytesRemaining} more bytes of file data` : 'Truncated archive: unexpected end of file data', C.ZipErrorCode.TRUNCATED_ARCHIVE);
      const stream = this.currentStream;
      // Detach first so emitError() below does not report to the stream twice
      this.currentStream = null;
      // Report on the entry stream; stream.end() is deliberately NOT called so
      // the error prevents normal completion.
      emitErrorToStream(stream, err);
      // Report on the parser for iterator-level error handling
      this.emitError(err);
      if (callback) callback();
      return;
    }

    if (this.locked) {
      // Even when locked, check for truncation in data-consuming states
      this.checkLockedEndState();
    } else {
      // process() runs checkEndState() itself when appropriate
      this.process();
    }
    if (callback) callback();
  }

  /**
   * Whether a compression handler exists and reports that it is still
   * waiting for asynchronous completion.
   *
   * @returns {boolean}
   */
  hasPendingCompression() {
    const handler = this.compressionHandler;
    return handler != null && Boolean(handler.isWaiting());
  }

  /**
   * Check for truncation when input ends while locked.
   * Catches a premature EOF during file data or data descriptor handling.
   */
  checkLockedEndState() {
    if (this.state === State.DATA_DESCRIPTOR) {
      // Waiting for a data descriptor that will never arrive
      this.emitError(C.createZipError('Truncated archive: unexpected end while reading data descriptor', C.ZipErrorCode.TRUNCATED_ARCHIVE));
      return;
    }
    if (this.state !== State.FILE_DATA) return;

    const header = this.currentHeader;
    if (!header) return;

    if (header.hasDataDescriptor) {
      // Still scanning for the boundary signature, so the data is incomplete
      this.emitError(C.createZipError('Truncated archive: unexpected end of file data', C.ZipErrorCode.TRUNCATED_ARCHIVE));
    } else if (this.bytesRemaining > 0) {
      // Known-size entry that still needs bytes
      this.emitError(C.createZipError(`Truncated archive: expected ${this.bytesRemaining} more bytes of file data`, C.ZipErrorCode.TRUNCATED_ARCHIVE));
    } else {
      // All entry bytes were received; let the entry complete
      this.finishKnownSizeEntry();
    }
  }

  /**
   * Decide, once input has ended, whether the parser stopped in a valid place.
   * Emits 'finish' for a clean stop and a TRUNCATED_ARCHIVE error otherwise.
   */
  checkEndState() {
    if (this.state === State.FINISHED) {
      return; // Already finished
    }
    if (this.state === State.SIGNATURE) {
      // Empty buffer: between entries or an empty archive.
      // Central directory / end record: the normal end of the entries.
      const cleanStop = this.buffer.length === 0 || this.buffer.startsWith(C.SIG_CENTRAL_DIR) || this.buffer.startsWith(C.SIG_END_OF_CENTRAL_DIR);
      if (cleanStop) {
        this.finish();
        return;
      }
    }
    // Anything else is unexpected
    this.emitError(C.createZipError(`Unexpected end of input in state: ${this.state}`, C.ZipErrorCode.TRUNCATED_ARCHIVE));
  }

  // ===========================================================================
  // Private Methods
  // ===========================================================================

  /**
   * Run the state machine until no state can make further progress.
   * Note: `locked` only prevents starting NEW entries, not processing the
   * current entry's data.
   */
  process() {
    while (this.processState()) {
      // keep stepping while the current state reports progress
    }
    // If input has ended and no entry is in flight, validate the final state.
    // `locked` may be false while currentStream is still set (consumer called
    // next() early); in that case file data is still being processed.
    if (this.ended && !this.locked && !this.currentStream) {
      this.checkEndState();
    }
  }

  /**
   * Dispatch one step of the state machine.
   *
   * @returns {boolean} truthy when progress was made and another step should run
   */
  processState() {
    switch (this.state) {
      case State.SIGNATURE:
        return this.processSignature();
      case State.LOCAL_HEADER:
        return this.processLocalHeader();
      case State.FILE_DATA:
        return this.processFileData();
      case State.DATA_DESCRIPTOR:
        return this.processDataDescriptor();
      case State.FINISHED:
      default:
        return false;
    }
  }

  /**
   * Detect what signature comes next.
   *
   * @returns {boolean} `true` when a local file header follows
   */
  processSignature() {
    // Don't start a new entry while locked (waiting for consumer to call next())
    if (this.locked) {
      return false;
    }
    if (this.buffer.length < C.SIGNATURE_SIZE) {
      return false;
    }
    if (this.buffer.startsWith(C.SIG_LOCAL_FILE)) {
      this.state = State.LOCAL_HEADER;
      return true;
    }
    // Central directory (end of entries) or end record (empty archive)
    if (this.buffer.startsWith(C.SIG_CENTRAL_DIR) || this.buffer.startsWith(C.SIG_END_OF_CENTRAL_DIR)) {
      this.finish();
      return false;
    }
    // Unknown signature
    this.emitError(C.createZipError(`Invalid ZIP signature: 0x${this.buffer.slice(0, 4).toString('hex')}`, C.ZipErrorCode.INVALID_SIGNATURE));
    return false;
  }

  /**
   * Parse the local file header at the front of the buffer, validate that the
   * entry is supported, and hand the entry to the consumer.
   *
   * @returns {boolean} `true` when the header was consumed
   */
  processLocalHeader() {
    if (this.buffer.length < C.LOCAL_HEADER_FIXED_SIZE) {
      return false;
    }
    // Variable-length parts of the header (offsets 26 and 28 of the fixed part)
    const fileNameLength = this.buffer.readUInt16LEAt(26);
    const extraFieldLength = this.buffer.readUInt16LEAt(28);
    if (fileNameLength === null || extraFieldLength === null) {
      return false; // Need more data
    }
    const headerSize = C.LOCAL_HEADER_FIXED_SIZE + fileNameLength + extraFieldLength;
    if (this.buffer.length < headerSize) {
      // Wait for the complete header instead of reading past the buffered data
      return false;
    }
    // parseLocalFileHeader expects a contiguous buffer
    const buf = this.buffer.readBytesAt(0, headerSize);
    const header = parseLocalFileHeader(buf, 0);
    if (!header) {
      return false; // Need more data
    }
    // Encryption (traditional or strong/AES) is not supported
    if (header.isEncrypted || header.isStrongEncrypted) {
      this.emitError(C.createZipError('Encrypted ZIP entries are not supported', C.ZipErrorCode.ENCRYPTED_ENTRY));
      return false;
    }
    // Only STORE and DEFLATE are supported
    if (header.compressionMethod !== C.METHOD_STORE && header.compressionMethod !== C.METHOD_DEFLATE) {
      this.emitError(C.createZipError(`Unsupported compression method: ${header.compressionMethod}`, C.ZipErrorCode.UNSUPPORTED_METHOD));
      return false;
    }
    // Consume header from buffer
    this.buffer.skip(header.headerSize);
    this.currentHeader = header;
    // -1 marks "size unknown until the data descriptor"
    this.bytesRemaining = header.hasDataDescriptor ? -1 : header.compressedSize;
    this.createAndEmitEntryStream();
    return true;
  }

  /**
   * Create the entry stream for the current header, set up the matching
   * data-handling strategy, lock the parser and emit 'entry'.
   */
  createAndEmitEntryStream() {
    const header = this.currentHeader;
    if (!header) return;

    const entryStream = createEntryStream();
    this.currentStream = entryStream;

    // Initialize CRC state
    this.runningCrc = 0;
    this.expectedCrc = header.crc32;

    if (header.hasDataDescriptor) {
      // Size unknown: data is accumulated/scanned for the entry boundary
      this.compressedChunks = [];
      this.compressedChunksSize = 0;
      this.compressionHandler = null;
    } else if (header.compressedSize === 0) {
      // No data to decompress - end stream immediately
      this.compressedChunks = null;
      this.compressionHandler = null;
      entryStream.end();
    } else {
      // Known size with data: stream through a compression handler
      this.compressedChunks = null;
      const handlerOptions = {
        outputStream: entryStream,
        onComplete: () => this.onCompressionComplete(),
        onError: (err) => this.emitError(err),
        verifyCrc: this.options.verifyCrc,
      };
      this.compressionHandler = header.compressionMethod === C.METHOD_DEFLATE ? new DeflateStreamHandler(handlerOptions) : new StoreHandler(handlerOptions);
    }

    // Lock until consumer calls next()
    this.locked = true;
    this.state = State.FILE_DATA;
    this.emit('entry', header, entryStream, () => this.unlock());
  }

  /**
   * Completion callback handed to the compression handler: closes the entry,
   * releases the handler and resumes parsing at the next signature.
   */
  onCompressionComplete() {
    if (this.currentStream) {
      this.currentStream.end();
      this.currentStream = null;
    }
    if (this.compressionHandler) {
      this.compressionHandler.destroy();
      this.compressionHandler = null;
    }
    this.currentHeader = null;
    this.state = State.SIGNATURE;
    // Resume processing to handle next entry
    this.process();
  }

  /**
   * Process file data for the current entry, choosing the strategy from the
   * header (known size vs. data descriptor, DEFLATE vs. STORE).
   *
   * @returns {boolean} truthy when progress was made
   */
  processFileData() {
    const header = this.currentHeader;
    if (!header) return false;
    if (!header.hasDataDescriptor) {
      // Known size - simple case
      return this.processKnownSizeData();
    }
    // Unknown size - handle based on compression method
    return header.compressionMethod === C.METHOD_DEFLATE ? this.processDeflateDataDescriptor() : this.processStoreDataDescriptor();
  }

  /**
   * Process file data when the compressed size is known from the header.
   *
   * @returns {boolean} truthy when progress was made
   */
  processKnownSizeData() {
    if (this.bytesRemaining <= 0) {
      return this.finishKnownSizeEntry();
    }
    const available = Math.min(this.buffer.length, this.bytesRemaining);
    if (available === 0) {
      return false;
    }
    const chunk = this.buffer.consume(available);
    this.bytesRemaining -= available;
    if (this.compressionHandler) {
      this.compressionHandler.write(chunk);
    }
    if (this.bytesRemaining <= 0) {
      return this.finishKnownSizeEntry();
    }
    return true;
  }

  /**
   * Finish a known-size entry once all of its compressed bytes were consumed.
   *
   * @returns {boolean} falsy while completion is still pending on the handler
   */
  finishKnownSizeEntry() {
    // Handler already finishing asynchronously: wait for onCompressionComplete
    if (this.hasPendingCompression()) {
      return false;
    }
    if (this.compressionHandler) {
      // The handler's finish() receives the expected CRC; its `continue` flag
      // says whether the state machine may keep going right away.
      const result = this.compressionHandler.finish(this.expectedCrc);
      return result.continue;
    }
    // No handler (e.g. zero-length entry): close the entry directly
    this.finishEntry();
    return true;
  }

  /**
   * Process DEFLATE data with data descriptor.
   *
   * The compressed size is unknown upfront, so data is accumulated and
   * scanned for the entry boundary; once found, the data is inflated in one go
   * and the remainder is returned to the buffer for normal parsing.
   *
   * @returns {boolean} `true` when the entry data was inflated and emitted
   */
  processDeflateDataDescriptor() {
    if (this.compressedChunks === null) {
      this.compressedChunks = [];
      this.compressedChunksSize = 0;
    }
    if (this.buffer.length === 0) {
      return false;
    }
    // Move everything buffered into the accumulator
    const chunk = this.buffer.consume(this.buffer.length);
    this.compressedChunks.push(chunk);
    this.compressedChunksSize += chunk.length;

    // Memory limit (default 100 MiB); a limit <= 0 disables the check
    const configuredLimit = this.options.maxDataDescriptorBuffer;
    const maxBuffer = configuredLimit != null ? configuredLimit : 104857600;
    if (maxBuffer > 0 && this.compressedChunksSize > maxBuffer) {
      this.emitError(C.createZipError(`Data descriptor entry exceeds buffer limit: ${this.compressedChunksSize} > ${maxBuffer}`, C.ZipErrorCode.BUFFER_OVERFLOW));
      return false;
    }

    // Combine all chunks to search for boundaries
    const combined = Buffer.concat(this.compressedChunks);
    const header = this.currentHeader;
    const isZip64 = header != null && header.isZip64 != null ? header.isZip64 : false;
    const boundary = findDeflateBoundary(combined, isZip64);
    if (!boundary) {
      // No boundary yet - keep the already-joined buffer for the next round
      this.compressedChunks = [combined];
      return false;
    }

    // Compressed data is from 0 to dataEnd; descriptor + rest goes back
    const compressedData = combined.slice(0, boundary.dataEnd);
    const remainder = combined.slice(boundary.dataEnd);
    this.buffer.prepend(remainder);
    this.compressedChunks = null;
    this.compressedChunksSize = 0;

    // Decompress and emit to consumer
    try {
      const decompressed = inflateRaw(compressedData);
      this.finishDeflateEntry(decompressed);
    } catch (err) {
      this.emitError(err);
      return false;
    }
    return true;
  }

  /**
   * Complete a DEFLATE data descriptor entry after decompression: record the
   * CRC, deliver the data, close the entry stream and move on to the
   * data descriptor.
   *
   * @param {Buffer} decompressed - inflated entry content
   */
  finishDeflateEntry(decompressed) {
    // CRC is compared later, once the data descriptor is parsed
    if (this.options.verifyCrc !== false) {
      this.runningCrc = crc32(decompressed);
    }
    if (this.currentStream) {
      this.currentStream.write(decompressed);
      this.currentStream.end();
      this.currentStream = null;
    }
    this.state = State.DATA_DESCRIPTOR;
  }

  /**
   * Process STORE data with data descriptor.
   *
   * STORE has no internal end markers, so the buffer is scanned for the end
   * of the data; until it is found, everything except a trailing safety
   * margin is passed through to the entry stream.
   *
   * @returns {boolean} `true` when the end of the data was found
   */
  processStoreDataDescriptor() {
    const header = this.currentHeader;
    const isZip64 = header != null && header.isZip64 != null ? header.isZip64 : false;
    const verifyCrc = this.options.verifyCrc !== false;
    const dataEnd = findStoreDataEnd(this.buffer, isZip64);

    if (dataEnd < 0) {
      // End not found yet - emit all but a safety margin that is kept back so
      // a signature split across chunks can still be detected
      const safetyBuffer = getSafetyBufferSize();
      if (this.buffer.length > safetyBuffer) {
        const chunk = this.buffer.consume(this.buffer.length - safetyBuffer);
        if (verifyCrc) {
          this.runningCrc = crc32(chunk, this.runningCrc);
        }
        if (this.currentStream) {
          this.currentStream.write(chunk);
        }
      }
      return false;
    }

    // Emit the final piece of file data up to the descriptor
    if (dataEnd > 0) {
      const chunk = this.buffer.consume(dataEnd);
      if (verifyCrc) {
        this.runningCrc = crc32(chunk, this.runningCrc);
      }
      if (this.currentStream) {
        this.currentStream.write(chunk);
        this.currentStream.end();
        this.currentStream = null;
      }
    }
    this.state = State.DATA_DESCRIPTOR;
    return true;
  }

  /**
   * Parse the data descriptor that follows the entry data, verify the CRC
   * against it (unless disabled) and finish the entry.
   *
   * @returns {boolean} `true` when the descriptor was consumed
   */
  processDataDescriptor() {
    const header = this.currentHeader;
    if (!header) return false;
    const isZip64 = header.isZip64;
    // Data descriptors are small (12-24 bytes); only a bounded prefix is sliced
    const maxDescriptorSize = isZip64 ? 32 : 24; // Safe upper bound
    const buf = this.buffer.slice(0, Math.min(this.buffer.length, maxDescriptorSize));
    const descriptor = parseDataDescriptor(buf, 0, isZip64);
    if (!descriptor) {
      return false; // Need more data
    }
    // Verify CRC using the CRC from the data descriptor
    if (this.options.verifyCrc !== false && this.runningCrc !== descriptor.crc32) {
      this.emitError(C.createZipError(`CRC32 mismatch: expected ${descriptor.crc32.toString(16)}, got ${this.runningCrc.toString(16)}`, C.ZipErrorCode.CRC_MISMATCH));
      return false;
    }
    this.buffer.skip(descriptor.size);
    this.finishEntry();
    return true;
  }

  /**
   * Finish the current entry: close its stream, release per-entry resources
   * and return to the SIGNATURE state.
   */
  finishEntry() {
    if (this.currentStream) {
      this.currentStream.end();
      this.currentStream = null;
    }
    if (this.compressionHandler) {
      this.compressionHandler.destroy();
      this.compressionHandler = null;
    }
    // Clean up data descriptor buffers
    this.compressedChunks = null;
    this.compressedChunksSize = 0;
    // Reset CRC state
    this.runningCrc = 0;
    this.expectedCrc = 0;
    this.currentHeader = null;
    this.state = State.SIGNATURE;
  }

  /**
   * Unlock (consumer called next()) and continue processing.
   */
  unlock() {
    this.locked = false;
    this.process();
  }

  /**
   * Stop the parser, forward the error to the open entry stream (if any),
   * release per-entry resources and emit 'error'.
   *
   * @param {Error} err
   */
  emitError(err) {
    this.state = State.FINISHED;
    // Propagate error to the current entry stream so consumers receive it
    if (this.currentStream) {
      const stream = this.currentStream;
      this.currentStream = null;
      emitStreamError(stream, err);
    }
    if (this.compressionHandler) {
      this.compressionHandler.destroy();
      this.compressionHandler = null;
    }
    this.compressedChunks = null;
    this.compressedChunksSize = 0;
    this.currentHeader = null;
    this.emit('error', err);
  }

  /**
   * Signal completion; emits 'finish' at most once per FINISHED transition.
   */
  finish() {
    if (this.state === State.FINISHED) return;
    this.state = State.FINISHED;
    this.emit('finish');
  }
}
// =============================================================================
// ZipExtract Class
// =============================================================================
export { ZipExtract as default };