tar-iterator
Version:
Extract contents from tar archive type using an iterator API using streams or paths. Use stream interface and pipe transforms to add decompression algorithms
490 lines (489 loc) • 19.7 kB
JavaScript
/**
* TarExtract - Streaming TAR extraction
*
* Event-based TAR parser that emits 'entry' events for each file.
* Node 0.8 compatible.
*
* State Machine:
* ```
* ┌─────────────────────────────────────────────┐
* │ │
* HEADER ─┬─ [file] ────> FILE_DATA ──> PADDING ─────────────────────>──┤
* │ │
* ├─ [gnu-long-path] ──> GNU_LONG_PATH ──> PADDING ──>──────────┤
* │ │
* ├─ [gnu-long-link] ──> GNU_LONG_LINK ──> PADDING ──>──────────┤
* │ │
* ├─ [pax-header] ──> PAX_HEADER ──> PADDING ──>────────────────┤
* │ │
* ├─ [gnu-sparse] ─┬─> SPARSE_EXTENDED ──> SPARSE_DATA ──>──────┤
* │ │ │
* │ └─> SPARSE_DATA ──> PADDING ──>──────────────┤
* │ │
* └─ [null header] ──> END │
* │
* <─────────────────────────────────────────────────────────────┘
* ```
*
* Extension handling:
* - GNU LongPath/LongLink headers store path for NEXT entry
* - PAX headers store attributes for NEXT entry (or all entries if global)
* - Extensions are applied when the actual file header is processed
*
* Events:
* 'entry' (header: TarHeader, stream: Readable, next: () => void)
* 'error' (err: Error)
* 'finish' ()
*/ import { EventEmitter } from 'events';
import { BufferList } from 'extract-base-iterator';
import { BLOCK_SIZE, HEADER_SIZE } from './constants.js';
import EntryStream from './EntryStream.js';
import { applyExtensions, createExtensionState, finalizeExtension } from './Extensions.js';
import { overflow, parseHeader } from './headers.js';
import { parseGnuSparseExtended, parseGnuSparseHeader, SparseStream, sparseDataSize } from './sparse.js';
// Parser states
const STATE_HEADER = 0;
const STATE_FILE_DATA = 1;
const STATE_PADDING = 2;
const STATE_END = 3;
const STATE_GNU_LONG_PATH = 4;
const STATE_GNU_LONG_LINK = 5;
const STATE_PAX_HEADER = 6;
const STATE_SPARSE_EXTENDED = 7;
const STATE_SPARSE_DATA = 8;
let TarExtract = class TarExtract extends EventEmitter {
/**
* Write data to the parser
*/ write(chunk, callback) {
if (this.finished) {
if (callback) callback();
return false;
}
this.buffer.append(chunk);
this._process();
// Emit any pending entry that was parsed during _process()
// This is necessary because _process() may parse new entry headers
// from incoming data, and those entries need to be emitted to listeners
this.resume();
if (callback) callback();
return !this.locked;
}
/**
* Signal end of input
*/ end(callback) {
this.finished = true;
this._process();
// Emit any pending entry before checking for finish
this.resume();
this._maybeFinish();
if (callback) callback();
}
/**
* Emit error to the main stream and any active entry stream
* This prevents tests from hanging when errors occur mid-extraction
*/ _emitError(err) {
// Propagate error to any active entry stream first
const activeStream = this.entryStream || this.sparseStream;
if (activeStream && !activeStream.ended) {
activeStream.emit('error', err);
}
// Then emit to the main extract stream
this.emit('error', err);
}
/**
* Emit 'finish' if appropriate
*/ _maybeFinish() {
// Don't emit finish more than once
if (this.finishEmitted) return;
// Don't emit finish if we have a pending entry
if (this.pendingEntry) return;
// Don't emit finish if not finished yet
if (!this.finished) return;
// Don't emit finish if locked - consumer hasn't called next() yet
// and there may be more entries to process
if (this.locked) return;
// Only emit finish when we're in a terminal state
if (this.state === STATE_HEADER || this.state === STATE_END) {
this.state = STATE_END; // Mark as ended
this.finishEmitted = true;
this.emit('finish');
}
}
/**
* Resume parsing - emit any pending entry
* Call this after setting up 'entry' listeners
*/ resume() {
// Only emit if there are listeners - this prevents entries from being
// lost when resume() is called from write() before listeners are set up
// Use listeners().length for Node 0.8 compatibility (listenerCount added in 0.10)
if (this.pendingEntry && this.listeners('entry').length > 0) {
const entry = this.pendingEntry;
this.pendingEntry = null;
// Clear pending flag so file data can flow
this.pending = false;
// Emit the entry
this.emit('entry', entry.header, entry.stream, entry.next);
// Continue processing file data
this._process();
// Check if we should emit finish now
this._maybeFinish();
}
}
/**
* Process buffered data through state machine
*/ _process() {
// Note: locked/pending only blocks processing NEXT header, not current entry data
if (this.pending) return;
let cont = true;
while(cont){
switch(this.state){
case STATE_HEADER:
// Don't process new headers while locked
if (this.locked) {
cont = false;
} else {
cont = this._processHeader();
}
break;
case STATE_FILE_DATA:
cont = this._processFileData();
break;
case STATE_PADDING:
cont = this._processPadding();
break;
case STATE_GNU_LONG_PATH:
case STATE_GNU_LONG_LINK:
case STATE_PAX_HEADER:
cont = this._processExtensionData();
break;
case STATE_SPARSE_EXTENDED:
cont = this._processSparseExtended();
break;
case STATE_SPARSE_DATA:
cont = this._processSparseData();
break;
case STATE_END:
cont = false;
break;
default:
cont = false;
}
}
}
/**
* Process header state
*/ _processHeader() {
if (!this.buffer.has(HEADER_SIZE)) {
return false; // Need more data
}
const headerBuf = this.buffer.consume(HEADER_SIZE);
// Try to parse header
let header;
try {
header = parseHeader(headerBuf, this.options);
} catch (err) {
this._emitError(err);
this.state = STATE_END;
return false;
}
// Null header means end of archive (empty block)
if (header === null) {
this.state = STATE_END;
this.emit('finish');
return false;
}
this.header = header;
this.paddingRemaining = overflow(header.size);
// Handle GNU/PAX extension headers - collect data silently
if (header.type === 'gnu-long-path') {
this.extState.extensionRemaining = header.size;
this.extState.extensionData = [];
this.state = STATE_GNU_LONG_PATH;
return true; // Continue processing
}
if (header.type === 'gnu-long-link-path') {
this.extState.extensionRemaining = header.size;
this.extState.extensionData = [];
this.state = STATE_GNU_LONG_LINK;
return true; // Continue processing
}
if (header.type === 'pax-header') {
this.extState.extensionRemaining = header.size;
this.extState.extensionData = [];
this.state = STATE_PAX_HEADER;
return true; // Continue processing
}
if (header.type === 'pax-global-header') {
// For global headers, we read them but they apply to all subsequent entries
this.extState.extensionRemaining = header.size;
this.extState.extensionData = [];
this.state = STATE_PAX_HEADER; // Same handling, different application
return true; // Continue processing
}
// Handle GNU sparse files
if (header.type === 'gnu-sparse') {
// Parse sparse info from header
this.sparseInfo = parseGnuSparseHeader(headerBuf);
// Apply extensions (e.g., GNU long path)
applyExtensions(header, this.extState);
// Update header size to real (reconstructed) file size
header.size = this.sparseInfo.realSize;
// If extended sparse headers follow, read them first
if (this.sparseInfo.isExtended) {
this.header = header;
this.state = STATE_SPARSE_EXTENDED;
return true; // Continue processing
}
// No extended headers - set up sparse entry now
return this._setupSparseEntry(header);
}
// Apply any pending GNU/PAX extensions to this entry
applyExtensions(header, this.extState);
// Set up for file data
this.entryRemaining = header.size;
// Create entry stream
this.entryStream = new EntryStream();
// Lock until consumer calls next()
this.locked = true;
this.pending = true;
// Store pending entry (will be emitted when consumer calls resume())
const self = this;
const entryStream = this.entryStream;
const next = function next() {
self._unlock();
};
this.pendingEntry = {
header,
stream: entryStream,
next
};
// If no data, go straight to padding
if (this.entryRemaining === 0) {
this.entryStream.end();
this.entryStream = null;
this.state = this.paddingRemaining > 0 ? STATE_PADDING : STATE_HEADER;
} else {
this.state = STATE_FILE_DATA;
}
return false; // Don't continue processing until unlocked
}
/**
* Process extension data (GNU long path/link, PAX headers)
*/ _processExtensionData() {
if (this.extState.extensionRemaining <= 0) {
// Done collecting extension data - decode and store
const encoding = this.options.filenameEncoding || 'utf8';
finalizeExtension(this.extState, this.state, this.header, encoding);
this.state = this.paddingRemaining > 0 ? STATE_PADDING : STATE_HEADER;
return true;
}
if (this.buffer.length === 0) {
return false; // Need more data
}
// Read as much as we can
const toRead = Math.min(this.extState.extensionRemaining, this.buffer.length);
const data = this.buffer.consume(toRead);
this.extState.extensionRemaining -= toRead;
this.extState.extensionData.push(data);
// Check if done
if (this.extState.extensionRemaining <= 0) {
const encoding = this.options.filenameEncoding || 'utf8';
finalizeExtension(this.extState, this.state, this.header, encoding);
this.state = this.paddingRemaining > 0 ? STATE_PADDING : STATE_HEADER;
}
return true;
}
/**
* Process file data state
*/ _processFileData() {
if (this.entryRemaining <= 0) {
// Done with file data
if (this.entryStream) {
this.entryStream.end();
this.entryStream = null;
}
this.state = this.paddingRemaining > 0 ? STATE_PADDING : STATE_HEADER;
return true;
}
if (this.buffer.length === 0) {
return false; // Need more data
}
// Read as much as we can
const toRead = Math.min(this.entryRemaining, this.buffer.length);
const data = this.buffer.consume(toRead);
this.entryRemaining -= toRead;
// Push to entry stream
if (this.entryStream) {
this.entryStream.push(data);
}
// Check if done
if (this.entryRemaining <= 0) {
if (this.entryStream) {
this.entryStream.end();
this.entryStream = null;
}
this.state = this.paddingRemaining > 0 ? STATE_PADDING : STATE_HEADER;
}
return true;
}
/**
* Process padding state
*/ _processPadding() {
if (this.paddingRemaining <= 0) {
this.state = STATE_HEADER;
return true;
}
if (this.buffer.length === 0) {
return false; // Need more data
}
// Skip padding bytes
const toSkip = Math.min(this.paddingRemaining, this.buffer.length);
this.buffer.consume(toSkip);
this.paddingRemaining -= toSkip;
if (this.paddingRemaining <= 0) {
this.state = STATE_HEADER;
}
return true;
}
/**
* Unlock parser (called by next() callback)
*/ _unlock() {
this.locked = false;
this.pending = false;
this._process();
// After processing, if there's a pending entry, emit it
// (the consumer's listeners are still set up from previous entry)
this.resume();
// Check if we should emit finish (e.g., if end() was called while locked)
this._maybeFinish();
}
/**
* Set up a sparse entry with SparseStream
*/ _setupSparseEntry(header) {
if (!this.sparseInfo) {
this._emitError(new Error('Sparse info not available'));
this.state = STATE_END;
return false;
}
// Calculate actual data size (sum of all sparse entry numbytes)
this.sparseDataRemaining = sparseDataSize(this.sparseInfo.entries);
// Calculate padding for the actual data size
this.paddingRemaining = overflow(this.sparseDataRemaining);
// Create sparse stream for reconstruction
this.sparseStream = new SparseStream(this.sparseInfo.entries, this.sparseInfo.realSize);
// Lock until consumer calls next()
this.locked = true;
this.pending = true;
// Store pending entry (the stream looks like a regular entry to consumers)
const self = this;
const stream = this.sparseStream;
const next = function next() {
self._unlock();
};
// Change header type to 'file' for consumers (they don't need to know it's sparse)
header.type = 'file';
this.pendingEntry = {
header,
stream,
next
};
// Go to sparse data state
if (this.sparseDataRemaining === 0) {
// No data - just holes (all zeros)
this.sparseStream.end();
this.sparseStream = null;
this.sparseInfo = null;
this.state = this.paddingRemaining > 0 ? STATE_PADDING : STATE_HEADER;
} else {
this.state = STATE_SPARSE_DATA;
}
return false; // Don't continue until unlocked
}
/**
* Process extended sparse headers
*/ _processSparseExtended() {
if (!this.buffer.has(BLOCK_SIZE)) {
return false; // Need more data
}
const extBuf = this.buffer.consume(BLOCK_SIZE);
const ext = parseGnuSparseExtended(extBuf);
// Add entries to sparse info
if (this.sparseInfo) {
for(let i = 0; i < ext.entries.length; i++){
this.sparseInfo.entries.push(ext.entries[i]);
}
// Check if more extended headers follow
if (ext.isExtended) {
return true; // Continue reading extended headers
}
}
// Done reading extended headers - set up the sparse entry
if (this.header) {
return this._setupSparseEntry(this.header);
}
// Should not reach here
this._emitError(new Error('Header not available for sparse entry'));
this.state = STATE_END;
return false;
}
/**
* Process sparse file data
*/ _processSparseData() {
if (this.sparseDataRemaining <= 0) {
// Done with sparse data
if (this.sparseStream) {
this.sparseStream.end();
this.sparseStream = null;
}
this.sparseInfo = null;
this.state = this.paddingRemaining > 0 ? STATE_PADDING : STATE_HEADER;
return true;
}
if (this.buffer.length === 0) {
return false; // Need more data
}
// Read as much as we can
const toRead = Math.min(this.sparseDataRemaining, this.buffer.length);
const data = this.buffer.consume(toRead);
this.sparseDataRemaining -= toRead;
// Push to sparse stream for reconstruction
if (this.sparseStream) {
this.sparseStream.push(data);
}
// Check if done
if (this.sparseDataRemaining <= 0) {
if (this.sparseStream) {
this.sparseStream.end();
this.sparseStream = null;
}
this.sparseInfo = null;
this.state = this.paddingRemaining > 0 ? STATE_PADDING : STATE_HEADER;
}
return true;
}
constructor(options){
super(), // Current entry state
this.header = null, this.entryStream = null, this.entryRemaining = 0, this.paddingRemaining = 0, // Backpressure control
this.locked = false, this.pending = false, this.finished = false, this.finishEmitted = false, // Pending entry to emit (waiting for consumer to set up listeners)
this.pendingEntry = null, // GNU sparse file state
this.sparseInfo = null, this.sparseStream = null, this.sparseDataRemaining = 0;
this.buffer = new BufferList();
this.state = STATE_HEADER;
this.options = options || {};
this.extState = createExtensionState();
}
};
/**
* TAR extraction stream
*
* Usage:
* const extract = new TarExtract();
* extract.on('entry', (header, stream, next) => { ... });
* extract.on('error', (err) => { ... });
* extract.on('finish', () => { ... });
* source.on('data', (chunk) => extract.write(chunk));
* source.on('end', () => extract.end());
*
* @internal
* @hidden
*/ export { TarExtract as default };