/*
file-type
Version:
Detect the file type of a file, stream, or data
644 lines (550 loc) • 19.6 kB
JavaScript
*/
import * as Token from 'token-types';
import * as strtok3 from 'strtok3/core';
import {ZipHandler} from '@tokenizer/inflate';
import {
maximumUntrustedSkipSizeInBytes,
ParserHardLimitError,
safeIgnore,
hasUnknownFileSize,
hasExceededUnknownSizeScanBudget,
} from '../parser.js';
// Default upper bound (1 MiB) for the compressed and decompressed size of a single ZIP entry.
const maximumZipEntrySizeInBytes = 1024 * 1024;
// Maximum number of local file headers `unzip` visits before it throws.
const maximumZipEntryCount = 1024;
// Hard ceiling for a single buffered entry read (2^31 - 1 bytes).
const maximumZipBufferedReadSizeInBytes = (2 ** 31) - 1;
// Size cap for the text entries that are read for detection (`mimetype`, `[Content_Types].xml`).
// Currently identical to the general entry cap.
const maximumZipTextEntrySizeInBytes = maximumZipEntrySizeInBytes;
// Exact error messages that `isRecoverableZipError` treats as recoverable.
const recoverableZipErrorMessages = new Set([
'Unexpected signature',
'Encrypted ZIP',
'Expected Central-File-Header signature',
]);
// Error message prefixes that `isRecoverableZipError` treats as recoverable.
// The messages thrown in this file end in a variable part (limit, method number, position), hence prefix matching.
const recoverableZipErrorMessagePrefixes = [
'ZIP entry count exceeds ',
'Unsupported ZIP compression method:',
'ZIP entry compressed data exceeds ',
'ZIP entry decompressed data exceeds ',
'Expected data-descriptor-signature at position ',
];
// Error `code` values that `isRecoverableZipError` treats as recoverable.
// NOTE(review): these look like zlib / stream-state error codes raised during decompression — confirm against the runtime.
const recoverableZipErrorCodes = new Set([
'Z_BUF_ERROR',
'Z_DATA_ERROR',
'ERR_INVALID_STATE',
]);
async function decompressDeflateRawWithLimit(data, {maximumLength = maximumZipEntrySizeInBytes} = {}) {
const input = new ReadableStream({
start(controller) {
controller.enqueue(data);
controller.close();
},
});
const output = input.pipeThrough(new DecompressionStream('deflate-raw'));
const reader = output.getReader();
const chunks = [];
let totalLength = 0;
try {
for (;;) {
const {done, value} = await reader.read();
if (done) {
break;
}
totalLength += value.length;
if (totalLength > maximumLength) {
await reader.cancel();
throw new Error(`ZIP entry decompressed data exceeds ${maximumLength} bytes`);
}
chunks.push(value);
}
} finally {
reader.releaseLock();
}
const uncompressedData = new Uint8Array(totalLength);
let offset = 0;
for (const chunk of chunks) {
uncompressedData.set(chunk, offset);
offset += chunk.length;
}
return uncompressedData;
}
/**
Copy a sequence of byte chunks back-to-back into one newly allocated array.

@param {Iterable<Uint8Array>} chunks - The chunks, in output order.
@param {number} totalLength - Length of the array to allocate.
@returns {Uint8Array} The concatenated bytes.
*/
function mergeByteChunks(chunks, totalLength) {
	const output = new Uint8Array(totalLength);
	let writePosition = 0;
	for (const part of chunks) {
		output.set(part, writePosition);
		writePosition += part.length;
	}

	return output;
}
/**
Compute the largest entry size that may be read into memory at the tokenizer's current position.

@param {object} tokenizer - Reads `tokenizer.fileInfo.size` and `tokenizer.position`.
@returns {number} The bytes remaining in the file (never negative) when the size is a finite number, capped at `maximumZipBufferedReadSizeInBytes`.
*/
function getMaximumZipBufferedReadLength(tokenizer) {
	const {size} = tokenizer.fileInfo;

	// Without a finite file size there is no known remainder, so only the global cap applies.
	let bytesLeft = Number.MAX_SAFE_INTEGER;
	if (Number.isFinite(size)) {
		bytesLeft = Math.max(0, size - tokenizer.position);
	}

	return Math.min(bytesLeft, maximumZipBufferedReadSizeInBytes);
}
/**
Decide whether a failure raised while walking a ZIP archive can be tolerated by the caller.

End-of-stream and hard-limit errors are always recoverable. Any other value must be an `Error` whose message, message prefix, or `code` appears in the recoverable tables.

@param {unknown} error - The thrown value.
@returns {boolean} `true` when the error is recoverable.
*/
function isRecoverableZipError(error) {
	const isParserSignal = error instanceof strtok3.EndOfStreamError
		|| error instanceof ParserHardLimitError;
	if (isParserSignal) {
		return true;
	}

	if (!(error instanceof Error)) {
		return false;
	}

	const {message, code} = error;
	return recoverableZipErrorMessages.has(message)
		|| recoverableZipErrorCodes.has(code)
		|| recoverableZipErrorMessagePrefixes.some(prefix => message.startsWith(prefix));
}
/**
Check whether an entry's declared sizes are sane enough to read its content for detection.

@param {object} zipHeader - Reads `compressedSize` and `uncompressedSize`.
@param {number} [maximumSize] - Largest size accepted for either field, in bytes.
@returns {boolean} `true` only when both sizes are finite, non-negative, and not larger than `maximumSize`.
*/
function canReadZipEntryForDetection(zipHeader, maximumSize = maximumZipEntrySizeInBytes) {
	const isAcceptableSize = size => Number.isFinite(size)
		&& size >= 0
		&& !(size > maximumSize);

	return [zipHeader.compressedSize, zipHeader.uncompressedSize].every(isAcceptableSize);
}
// -- iWork helpers --
/**
Create a fresh set of iWork detection flags, all initially `false`.

@returns {{hasDocumentEntry: boolean, hasMasterSlideEntry: boolean, hasTablesEntry: boolean, hasCalculationEngineEntry: boolean}}
*/
function createIWorkZipDetectionState() {
	const flagNames = [
		'hasDocumentEntry',
		'hasMasterSlideEntry',
		'hasTablesEntry',
		'hasCalculationEngineEntry',
	];

	return Object.fromEntries(flagNames.map(flagName => [flagName, false]));
}
/**
Record which iWork marker entries an archive contains, based on one entry's filename.

Flags are only ever switched on; a non-matching filename leaves the state untouched.

@param {object} iWorkState - The state object to mutate.
@param {string} filename - The ZIP entry filename.
*/
function updateIWorkZipDetectionStateFromFilename(iWorkState, filename) {
	const isDocument = filename === 'Index/Document.iwa';
	const isMasterSlide = filename.startsWith('Index/MasterSlide');
	const isTable = filename.startsWith('Index/Tables/');
	const isCalculationEngine = filename === 'Index/CalculationEngine.iwa';

	if (isDocument) {
		iWorkState.hasDocumentEntry = true;
	}

	if (isMasterSlide) {
		iWorkState.hasMasterSlideEntry = true;
	}

	if (isTable) {
		iWorkState.hasTablesEntry = true;
	}

	if (isCalculationEngine) {
		iWorkState.hasCalculationEngineEntry = true;
	}
}
/**
Map the collected iWork flags to a file type.

Requires the document entry; master slides then select Keynote, tables select Numbers, and anything else is treated as Pages.

@param {object} iWorkState - Reads `hasDocumentEntry`, `hasMasterSlideEntry`, and `hasTablesEntry`.
@returns {{ext: string, mime: string} | undefined} The file type, or `undefined` when there is no document entry.
*/
function getIWorkFileTypeFromZipEntries(iWorkState) {
	const {hasDocumentEntry, hasMasterSlideEntry, hasTablesEntry} = iWorkState;

	if (!hasDocumentEntry) {
		return undefined;
	}

	if (hasMasterSlideEntry) {
		return {ext: 'key', mime: 'application/vnd.apple.keynote'};
	}

	return hasTablesEntry
		? {ext: 'numbers', mime: 'application/vnd.apple.numbers'}
		: {ext: 'pages', mime: 'application/vnd.apple.pages'};
}
// -- OpenXML helpers --
// Lookup table keyed by lowercase MIME type. An entry without `mime` reports the (lowercased) input as its MIME type.
const fileTypeByLowercaseMimeType = new Map([
	['application/epub+zip', {ext: 'epub'}],
	['application/vnd.oasis.opendocument.text', {ext: 'odt'}],
	['application/vnd.oasis.opendocument.text-template', {ext: 'ott'}],
	['application/vnd.oasis.opendocument.spreadsheet', {ext: 'ods'}],
	['application/vnd.oasis.opendocument.spreadsheet-template', {ext: 'ots'}],
	['application/vnd.oasis.opendocument.presentation', {ext: 'odp'}],
	['application/vnd.oasis.opendocument.presentation-template', {ext: 'otp'}],
	['application/vnd.oasis.opendocument.graphics', {ext: 'odg'}],
	['application/vnd.oasis.opendocument.graphics-template', {ext: 'otg'}],
	['application/vnd.openxmlformats-officedocument.presentationml.slideshow', {ext: 'ppsx'}],
	['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', {ext: 'xlsx'}],
	['application/vnd.ms-excel.sheet.macroenabled', {ext: 'xlsm', mime: 'application/vnd.ms-excel.sheet.macroenabled.12'}],
	['application/vnd.openxmlformats-officedocument.spreadsheetml.template', {ext: 'xltx'}],
	['application/vnd.ms-excel.template.macroenabled', {ext: 'xltm', mime: 'application/vnd.ms-excel.template.macroenabled.12'}],
	['application/vnd.ms-powerpoint.slideshow.macroenabled', {ext: 'ppsm', mime: 'application/vnd.ms-powerpoint.slideshow.macroenabled.12'}],
	['application/vnd.openxmlformats-officedocument.wordprocessingml.document', {ext: 'docx'}],
	['application/vnd.ms-word.document.macroenabled', {ext: 'docm', mime: 'application/vnd.ms-word.document.macroenabled.12'}],
	['application/vnd.openxmlformats-officedocument.wordprocessingml.template', {ext: 'dotx'}],
	['application/vnd.ms-word.template.macroenabledtemplate', {ext: 'dotm', mime: 'application/vnd.ms-word.template.macroenabled.12'}],
	['application/vnd.openxmlformats-officedocument.presentationml.template', {ext: 'potx'}],
	['application/vnd.ms-powerpoint.template.macroenabled', {ext: 'potm', mime: 'application/vnd.ms-powerpoint.template.macroenabled.12'}],
	['application/vnd.openxmlformats-officedocument.presentationml.presentation', {ext: 'pptx'}],
	['application/vnd.ms-powerpoint.presentation.macroenabled', {ext: 'pptm', mime: 'application/vnd.ms-powerpoint.presentation.macroenabled.12'}],
	['application/vnd.ms-visio.drawing', {ext: 'vsdx', mime: 'application/vnd.visio'}],
	['application/vnd.ms-package.3dmanufacturing-3dmodel+xml', {ext: '3mf', mime: 'model/3mf'}],
]);

/**
Translate a MIME type found inside an archive into a file type.

Matching is case-insensitive.

@param {string} mimeType - The MIME type read from the archive.
@returns {{ext: string, mime: string} | undefined} A new file-type object, or `undefined` for an unrecognised MIME type.
*/
function getFileTypeFromMimeType(mimeType) {
	const normalizedMimeType = mimeType.toLowerCase();
	const match = fileTypeByLowercaseMimeType.get(normalizedMimeType);
	if (match === undefined) {
		return undefined;
	}

	// Build a fresh object per call so callers never share (or mutate) the table entries.
	return {ext: match.ext, mime: match.mime ?? normalizedMimeType};
}
/**
Create a fresh set of OpenXML detection flags, all initially `false`.

@returns {object} State with the `[Content_Types].xml` progress flags and the directory/entry marker flags.
*/
function createOpenXmlZipDetectionState() {
	const flagNames = [
		'hasContentTypesEntry',
		'hasParsedContentTypesEntry',
		'isParsingContentTypes',
		'hasUnparseableContentTypes',
		'hasWordDirectory',
		'hasPresentationDirectory',
		'hasSpreadsheetDirectory',
		'hasThreeDimensionalModelEntry',
	];

	return Object.fromEntries(flagNames.map(flagName => [flagName, false]));
}
/**
Record which OpenXML marker directories/entries an archive contains, based on one entry's filename.

Flags are only ever switched on; a non-matching filename leaves the state untouched.

@param {object} openXmlState - The state object to mutate.
@param {string} filename - The ZIP entry filename.
*/
function updateOpenXmlZipDetectionStateFromFilename(openXmlState, filename) {
	const directoryFlags = [
		['word/', 'hasWordDirectory'],
		['ppt/', 'hasPresentationDirectory'],
		['xl/', 'hasSpreadsheetDirectory'],
	];
	for (const [directoryPrefix, flagName] of directoryFlags) {
		if (filename.startsWith(directoryPrefix)) {
			openXmlState[flagName] = true;
		}
	}

	// A 3D model part lives under `3D/` and carries the `.model` extension.
	const isModelPart = filename.startsWith('3D/') && filename.endsWith('.model');
	if (isModelPart) {
		openXmlState.hasThreeDimensionalModelEntry = true;
	}
}
/**
Guess the OpenXML file type purely from which marker directories/entries were seen.

Candidates are tried in a fixed priority order: Word, PowerPoint, Excel, then 3MF.

@param {object} openXmlState - Reads the directory and 3D-model flags.
@returns {{ext: string, mime: string} | undefined} The first matching file type, or `undefined` when no marker was seen.
*/
function getOpenXmlFileTypeFromDirectoryNames(openXmlState) {
	const candidates = [
		[openXmlState.hasWordDirectory, 'docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'],
		[openXmlState.hasPresentationDirectory, 'pptx', 'application/vnd.openxmlformats-officedocument.presentationml.presentation'],
		[openXmlState.hasSpreadsheetDirectory, 'xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'],
		[openXmlState.hasThreeDimensionalModelEntry, '3mf', 'model/3mf'],
	];

	const firstMatch = candidates.find(([isPresent]) => isPresent);
	if (firstMatch === undefined) {
		return undefined;
	}

	const [, ext, mime] = firstMatch;
	return {ext, mime};
}
/**
Apply the directory-name heuristic, but only when `[Content_Types].xml` was seen and never handled.

The heuristic is suppressed when the content-types entry is absent, was flagged unparseable, is being parsed right now, or has already been parsed — in those cases content-type parsing either owns the answer or has failed.

@param {object} openXmlState - Reads the four `[Content_Types].xml` progress flags, then the directory flags.
@returns {{ext: string, mime: string} | undefined} The heuristic result, or `undefined` when the heuristic does not apply or finds nothing.
*/
function getOpenXmlFileTypeFromZipEntries(openXmlState) {
	const {
		hasContentTypesEntry,
		hasUnparseableContentTypes,
		isParsingContentTypes,
		hasParsedContentTypesEntry,
	} = openXmlState;

	const wasContentTypesHandlerSkipped = hasContentTypesEntry
		&& !hasUnparseableContentTypes
		&& !isParsingContentTypes
		&& !hasParsedContentTypesEntry;

	if (!wasContentTypesHandlerSkipped) {
		return undefined;
	}

	return getOpenXmlFileTypeFromDirectoryNames(openXmlState);
}
/**
Extract the main part's MIME type from the text of `[Content_Types].xml` without parsing XML.

The first `….main+xml"` attribute value wins; its `.main+xml` suffix is dropped. When there is none, the only other value recognised is the 3MF model content type.

@param {string} xmlContent - The decoded XML text.
@returns {string | undefined} The MIME type, or `undefined` when neither pattern is present.
*/
function getOpenXmlMimeTypeFromContentTypesXml(xmlContent) {
	const suffixPosition = xmlContent.indexOf('.main+xml"');

	if (suffixPosition !== -1) {
		// Everything between the nearest preceding quote and the suffix is the MIME type.
		// With no preceding quote, `lastIndexOf` yields -1 and the whole prefix is returned on purpose.
		const textBeforeSuffix = xmlContent.slice(0, suffixPosition);
		const valueStart = textBeforeSuffix.lastIndexOf('"') + 1;
		return textBeforeSuffix.slice(valueStart);
	}

	const threeDimensionalModelMimeType = 'application/vnd.ms-package.3dmanufacturing-3dmodel+xml';
	return xmlContent.includes(`ContentType="${threeDimensionalModelMimeType}"`)
		? threeDimensionalModelMimeType
		: undefined;
}
// Value of the little-endian uint32 that marks the start of a ZIP data descriptor record.
const zipDataDescriptorSignature = 0x08_07_4B_50;
// Number of bytes read for a data descriptor. The signature sits at offset 0,
// and the uint32 at offset 8 is compared against the number of entry bytes consumed.
const zipDataDescriptorLengthInBytes = 16;
// Bytes held back between scans so a descriptor split across two peeks can still be matched:
// one byte less than a full descriptor.
const zipDataDescriptorOverlapLengthInBytes = zipDataDescriptorLengthInBytes - 1;
/**
Search a buffer for a data descriptor that is consistent with the amount of entry data read so far.

A position matches when it holds the descriptor signature and the uint32 eight bytes later equals `bytesConsumed` plus that position — that is, the size recorded in the descriptor equals the number of entry bytes preceding it.

@param {Uint8Array} buffer - The bytes to scan.
@param {number} bytesConsumed - Entry bytes already consumed before `buffer` starts.
@returns {number} Offset of the first matching descriptor, or -1 when there is none (including when the buffer is shorter than one descriptor).
*/
function findZipDataDescriptorOffset(buffer, bytesConsumed) {
	// Only positions that leave room for a complete descriptor are candidates.
	const candidateCount = buffer.length - zipDataDescriptorLengthInBytes + 1;

	for (let offset = 0; offset < candidateCount; offset++) {
		const hasSignature = Token.UINT32_LE.get(buffer, offset) === zipDataDescriptorSignature;
		if (hasSignature && Token.UINT32_LE.get(buffer, offset + 8) === bytesConsumed + offset) {
			return offset;
		}
	}

	return -1;
}
/**
Consume the data of an entry whose size is not known up front by scanning forward for its data descriptor.

The stream is peeked through `zipHandler.syncBuffer`; bytes that precede the descriptor are either read into memory or skipped. The descriptor itself is left unread.

@param {object} zipHandler - Provides `tokenizer` and `syncBuffer`; `knownSizeDescriptorScannedBytes` is incremented when the file size is known.
@param {object} [options]
@param {boolean} [options.shouldBuffer] - Collect and return the consumed bytes instead of skipping them.
@param {number} [options.maximumLength] - Largest number of bytes that may be consumed.
@returns {Promise<Uint8Array | undefined>} The consumed bytes when `shouldBuffer` is truthy, otherwise `undefined`.
@throws {Error} When more than `maximumLength` bytes would be consumed.
*/
async function readZipDataDescriptorEntryWithLimit(zipHandler, {shouldBuffer, maximumLength = maximumZipEntrySizeInBytes} = {}) {
const {syncBuffer} = zipHandler;
const {length: syncBufferLength} = syncBuffer;
const chunks = [];
let bytesConsumed = 0;
for (;;) {
// Peek rather than read, so that only the bytes in front of the descriptor get consumed below.
const length = await zipHandler.tokenizer.peekBuffer(syncBuffer, {mayBeLess: true});
const dataDescriptorOffset = findZipDataDescriptorOffset(syncBuffer.subarray(0, length), bytesConsumed);
// When no descriptor was found in a completely filled buffer, hold back the tail so a descriptor
// straddling this peek and the next one can still match. `length - 1` keeps at least one byte consumable.
const retainedLength = dataDescriptorOffset >= 0
? 0
: (
length === syncBufferLength
? Math.min(zipDataDescriptorOverlapLengthInBytes, length - 1)
: 0
);
const chunkLength = dataDescriptorOffset >= 0 ? dataDescriptorOffset : length - retainedLength;
// Nothing to consume: the descriptor is at the current position, or no bytes could be peeked.
if (chunkLength === 0) {
break;
}
bytesConsumed += chunkLength;
// The limit is checked before the bytes are actually read or skipped.
if (bytesConsumed > maximumLength) {
throw new Error(`ZIP entry compressed data exceeds ${maximumLength} bytes`);
}
if (shouldBuffer) {
const data = new Uint8Array(chunkLength);
await zipHandler.tokenizer.readBuffer(data);
chunks.push(data);
} else {
await zipHandler.tokenizer.ignore(chunkLength);
}
// The tokenizer is now positioned at the descriptor.
if (dataDescriptorOffset >= 0) {
break;
}
}
// For known-size inputs, accumulate the scanned bytes on the handler; `getRemainingZipScanBudget` reads this total.
if (!hasUnknownFileSize(zipHandler.tokenizer)) {
zipHandler.knownSizeDescriptorScannedBytes += bytesConsumed;
}
if (!shouldBuffer) {
return;
}
return mergeByteChunks(chunks, bytesConsumed);
}
/**
Compute how many more bytes a data-descriptor scan may consume.

For inputs of unknown size the budget is `maximumUntrustedSkipSizeInBytes` minus the distance travelled since `startOffset`. Otherwise it is `maximumZipEntrySizeInBytes` minus the bytes already scanned for descriptors.

@param {object} zipHandler - Provides `tokenizer` and `knownSizeDescriptorScannedBytes`.
@param {number} startOffset - Tokenizer position at which the ZIP scan started.
@returns {number} The remaining budget in bytes, never negative.
*/
function getRemainingZipScanBudget(zipHandler, startOffset) {
	const {tokenizer} = zipHandler;

	const remainingBytes = hasUnknownFileSize(tokenizer)
		? maximumUntrustedSkipSizeInBytes - (tokenizer.position - startOffset)
		: maximumZipEntrySizeInBytes - zipHandler.knownSizeDescriptorScannedBytes;

	return Math.max(0, remainingBytes);
}
/**
Read or skip the data that follows a local file header.

Three paths, chosen in this order:
1. The header defers its size to a data descriptor (descriptor flag set and a compressed size of 0): scan for the descriptor.
2. The data should be buffered: validate the declared size against the buffered-read limit, then read it.
3. Otherwise: skip the declared number of bytes.

@param {object} zipHandler - Provides `tokenizer`.
@param {object} zipHeader - Reads `dataDescriptor` and `compressedSize`.
@param {object} [options]
@param {boolean} [options.shouldBuffer] - Return the entry bytes instead of skipping them.
@param {number} [options.maximumDescriptorLength] - Byte limit for the descriptor scan in path 1.
@returns {Promise<Uint8Array | undefined>} The entry bytes when buffering, otherwise `undefined`.
@throws {Error} When the declared compressed size is not a finite, non-negative number within the buffered-read limit.
*/
async function readZipEntryData(zipHandler, zipHeader, {shouldBuffer, maximumDescriptorLength = maximumZipEntrySizeInBytes} = {}) {
	const {tokenizer} = zipHandler;
	const {compressedSize} = zipHeader;

	const isSizeDeferredToDescriptor = zipHeader.dataDescriptor && compressedSize === 0;
	if (isSizeDeferredToDescriptor) {
		return readZipDataDescriptorEntryWithLimit(zipHandler, {
			shouldBuffer,
			maximumLength: maximumDescriptorLength,
		});
	}

	if (shouldBuffer) {
		const bufferedReadLimit = getMaximumZipBufferedReadLength(tokenizer);
		const isReadableSize = Number.isFinite(compressedSize)
			&& compressedSize >= 0
			&& !(compressedSize > bufferedReadLimit);
		if (!isReadableSize) {
			throw new Error(`ZIP entry compressed data exceeds ${bufferedReadLimit} bytes`);
		}

		const entryBytes = new Uint8Array(compressedSize);
		await tokenizer.readBuffer(entryBytes);
		return entryBytes;
	}

	// Skipping is capped by the per-entry limit for unknown-size inputs and by the file size otherwise.
	const skipLimit = hasUnknownFileSize(tokenizer)
		? maximumZipEntrySizeInBytes
		: tokenizer.fileInfo.size;
	await safeIgnore(tokenizer, compressedSize, {
		maximumLength: skipLimit,
		reason: 'ZIP entry compressed data',
	});
}
// Override the default inflate to enforce decompression size limits, since @tokenizer/inflate does not expose a configuration hook for this.
/**
Hand an entry's content to `callback`, inflating it first when necessary.

Compression method 0 passes `fileData` through unchanged; method 8 is decompressed as raw deflate, capped at `maximumZipEntrySizeInBytes`. Any other method is rejected.

@param {object} zipHeader - Reads `compressedMethod`.
@param {Uint8Array} fileData - The entry's raw bytes.
@param {Function} callback - Receives the entry content; its return value is returned.
@throws {Error} For an unsupported compression method, or when decompression exceeds the size limit.
*/
ZipHandler.prototype.inflate = async function (zipHeader, fileData, callback) {
	const {compressedMethod} = zipHeader;

	if (compressedMethod === 0) {
		return callback(fileData);
	}

	if (compressedMethod === 8) {
		const inflatedData = await decompressDeflateRawWithLimit(fileData, {maximumLength: maximumZipEntrySizeInBytes});
		return callback(inflatedData);
	}

	throw new Error(`Unsupported ZIP compression method: ${compressedMethod}`);
};
/**
Walk the archive's local file headers in stream order, replacing the default `unzip` with a version that enforces scan, entry-count, and size limits.

For every header, `fileCallback(zipHeader)` decides what happens next through the object it returns:
`handler` (optional) receives the entry's content after inflation, and a truthy `stop` ends the walk once the current entry has been processed.

@param {Function} fileCallback - Called synchronously with each local file header.
@throws {ParserHardLimitError} When probing an input of unknown size exceeds `maximumUntrustedSkipSizeInBytes`.
@throws {Error} When more than `maximumZipEntryCount` entries are seen, or a data descriptor does not start with its signature.
*/
ZipHandler.prototype.unzip = async function (fileCallback) {
let stop = false;
let zipEntryCount = 0;
const zipScanStart = this.tokenizer.position;
// Reset the per-scan counter that `readZipDataDescriptorEntryWithLimit` accumulates into.
this.knownSizeDescriptorScannedBytes = 0;
do {
// Budget check before touching the next header.
if (hasExceededUnknownSizeScanBudget(this.tokenizer, zipScanStart, maximumUntrustedSkipSizeInBytes)) {
throw new ParserHardLimitError(`ZIP stream probing exceeds ${maximumUntrustedSkipSizeInBytes} bytes`);
}
// NOTE(review): `readLocalFileHeader` is provided by `@tokenizer/inflate`; a falsy result is treated here as "no more entries" — confirm.
const zipHeader = await this.readLocalFileHeader();
if (!zipHeader) {
break;
}
zipEntryCount++;
if (zipEntryCount > maximumZipEntryCount) {
throw new Error(`ZIP entry count exceeds ${maximumZipEntryCount}`);
}
const next = fileCallback(zipHeader);
stop = Boolean(next.stop);
await this.tokenizer.ignore(zipHeader.extraFieldLength);
// Entry data is only buffered when the callback asked for it; otherwise it is skipped.
const fileData = await readZipEntryData(this, zipHeader, {
shouldBuffer: Boolean(next.handler),
maximumDescriptorLength: Math.min(maximumZipEntrySizeInBytes, getRemainingZipScanBudget(this, zipScanStart)),
});
if (next.handler) {
await this.inflate(zipHeader, fileData, next.handler);
}
// Entries flagged with a data descriptor are followed by one; consume it and verify its signature.
if (zipHeader.dataDescriptor) {
const dataDescriptor = new Uint8Array(zipDataDescriptorLengthInBytes);
await this.tokenizer.readBuffer(dataDescriptor);
if (Token.UINT32_LE.get(dataDescriptor, 0) !== zipDataDescriptorSignature) {
throw new Error(`Expected data-descriptor-signature at position ${this.tokenizer.position - dataDescriptor.length}`);
}
}
// Budget check again after the entry has been consumed.
if (hasExceededUnknownSizeScanBudget(this.tokenizer, zipScanStart, maximumUntrustedSkipSizeInBytes)) {
throw new ParserHardLimitError(`ZIP stream probing exceeds ${maximumUntrustedSkipSizeInBytes} bytes`);
}
} while (!stop);
};
/**
Detect the concrete file type of a ZIP-based container by walking its entries.

Entry filenames and, for `mimetype` and `[Content_Types].xml`, entry contents are inspected to recognise iWork, OpenXML, OpenDocument/EPUB, XPI, JAR, and APK archives. Recoverable ZIP errors end the walk without failing detection.

@param {object} tokenizer - The tokenizer to read the archive from.
@returns {Promise<{ext: string, mime: string}>} The detected file type, falling back to plain `zip`.
@throws Any error from the walk that `isRecoverableZipError` does not classify as recoverable.
*/
export async function detectZip(tokenizer) {
let fileType;
const openXmlState = createOpenXmlZipDetectionState();
const iWorkState = createIWorkZipDetectionState();
try {
await new ZipHandler(tokenizer).unzip(zipHeader => {
// Update the filename-based markers first; the checks below depend on them.
updateOpenXmlZipDetectionStateFromFilename(openXmlState, zipHeader.filename);
updateIWorkZipDetectionStateFromFilename(iWorkState, zipHeader.filename);
// Early exit for Keynote or Numbers when markers are definitive
if (iWorkState.hasDocumentEntry && (iWorkState.hasMasterSlideEntry || iWorkState.hasTablesEntry)) {
fileType = getIWorkFileTypeFromZipEntries(iWorkState);
return {stop: true};
}
// If `[Content_Types].xml` was seen earlier but could not be handled, settle for the directory-name heuristic
// as soon as a marker directory shows up.
const isOpenXmlContentTypesEntry = zipHeader.filename === '[Content_Types].xml';
const openXmlFileTypeFromEntries = getOpenXmlFileTypeFromZipEntries(openXmlState);
if (
!isOpenXmlContentTypesEntry
&& openXmlFileTypeFromEntries
) {
fileType = openXmlFileTypeFromEntries;
return {
stop: true,
};
}
switch (zipHeader.filename) {
case 'META-INF/mozilla.rsa':
fileType = {
ext: 'xpi',
mime: 'application/x-xpinstall',
};
return {
stop: true,
};
case 'META-INF/MANIFEST.MF':
fileType = {
ext: 'jar',
mime: 'application/java-archive',
};
return {
stop: true,
};
case 'mimetype':
// An entry with implausible or oversized declared sizes is not read; the walk continues.
if (!canReadZipEntryForDetection(zipHeader, maximumZipTextEntrySizeInBytes)) {
return {};
}
return {
async handler(fileData) {
// Use TextDecoder to decode the UTF-8 encoded data
const mimeType = new TextDecoder('utf-8').decode(fileData).trim();
fileType = getFileTypeFromMimeType(mimeType);
},
stop: true,
};
case '[Content_Types].xml': {
openXmlState.hasContentTypesEntry = true;
if (!canReadZipEntryForDetection(zipHeader, maximumZipTextEntrySizeInBytes)) {
openXmlState.hasUnparseableContentTypes = true;
return {};
}
// Set before the handler runs so a failure while reading the entry can be told apart from "never attempted".
openXmlState.isParsingContentTypes = true;
return {
async handler(fileData) {
// Use TextDecoder to decode the UTF-8 encoded data
const xmlContent = new TextDecoder('utf-8').decode(fileData);
const mimeType = getOpenXmlMimeTypeFromContentTypesXml(xmlContent);
if (mimeType) {
fileType = getFileTypeFromMimeType(mimeType);
}
openXmlState.hasParsedContentTypesEntry = true;
openXmlState.isParsingContentTypes = false;
},
stop: true,
};
}
default:
// Any filename containing `classes<digits>.dex` marks the archive as an APK.
if (/classes\d*\.dex/v.test(zipHeader.filename)) {
fileType = {
ext: 'apk',
mime: 'application/vnd.android.package-archive',
};
return {stop: true};
}
return {};
}
});
} catch (error) {
if (!isRecoverableZipError(error)) {
throw error;
}
// The walk failed while `[Content_Types].xml` was being read: record it as unparseable.
if (openXmlState.isParsingContentTypes) {
openXmlState.isParsingContentTypes = false;
openXmlState.hasUnparseableContentTypes = true;
}
// When the stream was truncated before reaching [Content_Types].xml, use directory names as a fallback.
// This handles LibreOffice-created OOXML files where [Content_Types].xml appears after content entries.
if (!fileType && error instanceof strtok3.EndOfStreamError && !openXmlState.hasContentTypesEntry) {
fileType = getOpenXmlFileTypeFromDirectoryNames(openXmlState);
}
}
// For inputs of unknown size, a lone `Index/Document.iwa` with no other iWork marker is not reported as Pages.
const iWorkFileType = hasUnknownFileSize(tokenizer)
&& iWorkState.hasDocumentEntry
&& !iWorkState.hasMasterSlideEntry
&& !iWorkState.hasTablesEntry
&& !iWorkState.hasCalculationEngineEntry
? undefined
: getIWorkFileTypeFromZipEntries(iWorkState);
// Precedence: explicit detection, then the OpenXML directory heuristic, then iWork, then generic ZIP.
return fileType ?? getOpenXmlFileTypeFromZipEntries(openXmlState) ?? iWorkFileType ?? {
ext: 'zip',
mime: 'application/zip',
};
}