UNPKG

isbinaryfile

Version:

Detects if a file is binary in Node.js. Similar to Perl's -B.

250 lines (249 loc) 8.12 kB
import { statSync, openSync, readSync, closeSync } from 'node:fs'; import { open, stat } from 'node:fs/promises'; import { detectUtf16NoBom, isTextWithEncodingHint } from './encoding.js'; const MAX_BYTES = 512; const UTF8_BOUNDARY_RESERVE = 3; // A very basic non-exception raising reader. Read bytes and // at the end use hasError() to check whether this worked. class Reader { fileBuffer; size; offset; error; constructor(fileBuffer, size) { this.fileBuffer = fileBuffer; this.size = size; this.offset = 0; this.error = false; } hasError() { return this.error; } nextByte() { if (this.offset === this.size || this.hasError()) { this.error = true; return 0xff; } return this.fileBuffer[this.offset++]; } next(len) { // Prevent massive array allocation by checking bounds first if (len < 0 || len > this.size - this.offset) { this.error = true; return []; } const n = new Array(); for (let i = 0; i < len; i++) { // Stop reading if an error occurred if (this.error) { return n; } n[i] = this.nextByte(); } return n; } } // Read a Google Protobuf var(iable)int from the buffer. function readProtoVarInt(reader) { let idx = 0; let varInt = 0; while (!reader.hasError()) { const b = reader.nextByte(); varInt = varInt | ((b & 0x7f) << (7 * idx)); if ((b & 0x80) === 0) { break; } if (idx >= 10) { // Varint can be between 1 and 10 bytes. This is too large. reader.error = true; break; } idx++; } return varInt; } // Attempt to taste a full Google Protobuf message. function readProtoMessage(reader) { const varInt = readProtoVarInt(reader); const wireType = varInt & 0x7; switch (wireType) { case 0: readProtoVarInt(reader); return true; case 1: reader.next(8); return true; case 2: const len = readProtoVarInt(reader); reader.next(len); return true; case 5: reader.next(4); return true; } return false; } // Check whether this seems to be a valid protobuf file. function isBinaryProto(fileBuffer, totalBytes) { const reader = new Reader(fileBuffer, totalBytes); let numMessages = 0; while (true) { // Definitely not a valid protobuf if (!readProtoMessage(reader) && !reader.hasError()) { return false; } // Short read? if (reader.hasError()) { break; } numMessages++; } return numMessages > 0; } export async function isBinaryFile(file, options) { if (isString(file)) { const fileStat = await stat(file); isStatFile(fileStat); const fileHandle = await open(file, 'r'); try { const allocBuffer = Buffer.alloc(MAX_BYTES + UTF8_BOUNDARY_RESERVE); const { bytesRead } = await fileHandle.read(allocBuffer, 0, MAX_BYTES + UTF8_BOUNDARY_RESERVE, 0); return isBinaryCheck(allocBuffer, bytesRead, options); } finally { await fileHandle.close(); } } else { const size = options?.size !== undefined ? options.size : file.length; return isBinaryCheck(file, size, options); } } export function isBinaryFileSync(file, options) { if (isString(file)) { const fileStat = statSync(file); isStatFile(fileStat); const fileDescriptor = openSync(file, 'r'); const allocBuffer = Buffer.alloc(MAX_BYTES + UTF8_BOUNDARY_RESERVE); const bytesRead = readSync(fileDescriptor, allocBuffer, 0, MAX_BYTES + UTF8_BOUNDARY_RESERVE, 0); closeSync(fileDescriptor); return isBinaryCheck(allocBuffer, bytesRead, options); } else { const size = options?.size !== undefined ? options.size : file.length; return isBinaryCheck(file, size, options); } } function isBinaryCheck(fileBuffer, bytesRead, options) { // empty file. no clue what it is. if (bytesRead === 0) { return false; } let suspiciousBytes = 0; const totalBytes = Math.min(bytesRead, MAX_BYTES + UTF8_BOUNDARY_RESERVE); const scanBytes = Math.min(totalBytes, MAX_BYTES); // UTF-8 BOM if (bytesRead >= 3 && fileBuffer[0] === 0xef && fileBuffer[1] === 0xbb && fileBuffer[2] === 0xbf) { return false; } // UTF-32 BOM if (bytesRead >= 4 && fileBuffer[0] === 0x00 && fileBuffer[1] === 0x00 && fileBuffer[2] === 0xfe && fileBuffer[3] === 0xff) { return false; } // UTF-32 LE BOM if (bytesRead >= 4 && fileBuffer[0] === 0xff && fileBuffer[1] === 0xfe && fileBuffer[2] === 0x00 && fileBuffer[3] === 0x00) { return false; } // GB BOM if (bytesRead >= 4 && fileBuffer[0] === 0x84 && fileBuffer[1] === 0x31 && fileBuffer[2] === 0x95 && fileBuffer[3] === 0x33) { return false; } if (totalBytes >= 5 && fileBuffer.slice(0, 5).toString() === '%PDF-') { /* PDF. This is binary. */ return true; } // UTF-16 BE BOM if (bytesRead >= 2 && fileBuffer[0] === 0xfe && fileBuffer[1] === 0xff) { return false; } // UTF-16 LE BOM if (bytesRead >= 2 && fileBuffer[0] === 0xff && fileBuffer[1] === 0xfe) { return false; } // Handle encoding hints - if provided, use specialized validation if (options?.encoding) { return !isTextWithEncodingHint(fileBuffer, bytesRead, options.encoding); } // Auto-detect UTF-16 without BOM by analyzing null byte patterns const utf16Detected = detectUtf16NoBom(fileBuffer, bytesRead); if (utf16Detected) { // Detected UTF-16 pattern, validate as text return !isTextWithEncodingHint(fileBuffer, bytesRead, utf16Detected); } for (let i = 0; i < scanBytes; i++) { if (fileBuffer[i] === 0) { // NULL byte--it's binary! return true; } else if ((fileBuffer[i] < 7 || fileBuffer[i] > 14) && (fileBuffer[i] < 32 || fileBuffer[i] > 127)) { // UTF-8 detection if (fileBuffer[i] >= 0xc0 && fileBuffer[i] <= 0xdf && i + 1 < totalBytes) { i++; if (fileBuffer[i] >= 0x80 && fileBuffer[i] <= 0xbf) { continue; } } else if (fileBuffer[i] >= 0xe0 && fileBuffer[i] <= 0xef && i + 2 < totalBytes) { i++; if (fileBuffer[i] >= 0x80 && fileBuffer[i] <= 0xbf && fileBuffer[i + 1] >= 0x80 && fileBuffer[i + 1] <= 0xbf) { i++; continue; } } else if (fileBuffer[i] >= 0xf0 && fileBuffer[i] <= 0xf7 && i + 3 < totalBytes) { i++; if (fileBuffer[i] >= 0x80 && fileBuffer[i] <= 0xbf && fileBuffer[i + 1] >= 0x80 && fileBuffer[i + 1] <= 0xbf && fileBuffer[i + 2] >= 0x80 && fileBuffer[i + 2] <= 0xbf) { i += 2; continue; } } suspiciousBytes++; // Read at least 32 fileBuffer before making a decision if (i >= 32 && (suspiciousBytes * 100) / scanBytes > 10) { return true; } } } if ((suspiciousBytes * 100) / scanBytes > 10) { return true; } if (suspiciousBytes > 1 && isBinaryProto(fileBuffer, scanBytes)) { return true; } return false; } function isString(x) { return typeof x === 'string'; } function isStatFile(stat) { if (!stat.isFile()) { throw new Error(`Path provided was not a file!`); } }