isbinaryfile
Version:
Detects if a file is binary in Node.js. Similar to Perl's -B.
101 lines (100 loc) • 4.11 kB
JavaScript
const MAX_BYTES = 512;
/**
* Detect UTF-16 without BOM by analyzing null byte patterns.
* UTF-16LE: ASCII chars have nulls at odd positions (e.g., 't\0e\0s\0t\0')
* UTF-16BE: ASCII chars have nulls at even positions (e.g., '\0t\0e\0s\0t')
*/
export function detectUtf16NoBom(fileBuffer, bytesRead) {
if (bytesRead < 4)
return null;
const scanLength = Math.min(bytesRead, MAX_BYTES);
let nullsAtEven = 0;
let nullsAtOdd = 0;
for (let i = 0; i < scanLength; i++) {
if (fileBuffer[i] === 0x00) {
if (i % 2 === 0)
nullsAtEven++;
else
nullsAtOdd++;
}
}
const totalNulls = nullsAtEven + nullsAtOdd;
// For UTF-16 ASCII text, roughly 30-70% of total bytes should be nulls
// (pure ASCII = 50%, mixed with non-ASCII = less)
if (totalNulls > scanLength * 0.3 && totalNulls < scanLength * 0.7) {
// UTF-16LE: nulls at odd positions (high bytes of little-endian chars)
if (nullsAtOdd > nullsAtEven * 3)
return 'utf-16le';
// UTF-16BE: nulls at even positions (high bytes of big-endian chars)
if (nullsAtEven > nullsAtOdd * 3)
return 'utf-16be';
}
return null;
}
/**
* Check if the buffer is valid text for the given encoding hint.
* Returns true if it's valid text (not binary), false if binary.
*/
export function isTextWithEncodingHint(fileBuffer, bytesRead, encoding) {
const scanLength = Math.min(bytesRead, MAX_BYTES);
// Handle UTF-16 hints
if (encoding === 'utf-16' || encoding === 'utf-16le' || encoding === 'utf-16be') {
// For UTF-16, null bytes are expected, so we just check for control characters
for (let i = 0; i < scanLength; i += 2) {
const byte1 = fileBuffer[i];
const byte2 = i + 1 < scanLength ? fileBuffer[i + 1] : 0;
// In UTF-16, check for problematic control characters
// Allow common ones: tab (0x09), newline (0x0A), carriage return (0x0D)
if (encoding === 'utf-16le' || encoding === 'utf-16') {
// LE: low byte first
if (byte2 === 0 && byte1 < 0x20 && byte1 !== 0x09 && byte1 !== 0x0A && byte1 !== 0x0D && byte1 !== 0x00) {
return false;
}
}
if (encoding === 'utf-16be' || encoding === 'utf-16') {
// BE: high byte first
if (byte1 === 0 && byte2 < 0x20 && byte2 !== 0x09 && byte2 !== 0x0A && byte2 !== 0x0D && byte2 !== 0x00) {
return false;
}
}
}
return true;
}
// Handle Latin-1 / ISO-8859-1 hints
if (encoding === 'latin1' || encoding === 'iso-8859-1') {
for (let i = 0; i < scanLength; i++) {
const byte = fileBuffer[i];
// Null byte is never valid in Latin-1 text files
if (byte === 0x00)
return false;
// Control characters except TAB, LF, CR are suspicious
if (byte < 0x20 && byte !== 0x09 && byte !== 0x0A && byte !== 0x0D) {
return false;
}
// Bytes 0x80-0xFF are all valid Latin-1 characters, so no check needed
}
return true;
}
// Handle CJK encodings
if (encoding === 'cjk' ||
encoding === 'big5' ||
encoding === 'gb2312' ||
encoding === 'gbk' ||
encoding === 'euc-kr' ||
encoding === 'shift-jis') {
// For CJK, we trust the user's hint and only check for definitive binary indicators
for (let i = 0; i < scanLength; i++) {
const byte = fileBuffer[i];
// Null byte is binary indicator even in CJK
if (byte === 0x00)
return false;
// Control characters (except TAB, LF, CR) suggest binary
if (byte < 0x20 && byte !== 0x09 && byte !== 0x0A && byte !== 0x0D) {
return false;
}
}
return true;
}
// Unknown encoding, fall through to default behavior
return false;
}