/*
 * istextorbinary
 * Version: (not specified)
 * Determine if a filename and/or buffer is text or binary. Smarter detection than the other solutions.
 * 233 lines (232 loc) • 9.02 kB
 * JavaScript
 */
;
/* eslint no-use-before-define:0 */
/**
 * Module-interop helper (looks compiler-emitted — NOTE(review): presumably TypeScript output, confirm against the build).
 * Exposes property `k` of module object `m` on object `o` under the name `k2` (which defaults to `k`).
 * An already-installed `this.__createBinding` is reused when `this` is truthy and provides one.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
// The target key defaults to the source key.
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Replace the descriptor with an enumerable getter that reads `m[k]` on every access when:
//  - `m` has no own property `k`, or
//  - the property is an accessor (has a "get" key) and `m` is not flagged `__esModule`, or
//  - the property is a data property that is writable or configurable.
// Otherwise the source descriptor is reused unchanged.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback when `Object.create` is unavailable: copy the current value by plain assignment.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
/**
 * Module-interop helper: stores `v` as the `default` member of object `o`.
 * An already-installed `this.__setModuleDefault` is reused when available.
 * When `Object.create` exists the member is defined as an enumerable property via
 * `Object.defineProperty`; otherwise it is set by plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (!Object.create) {
        // Legacy path: plain assignment.
        return function (target, value) {
            target["default"] = value;
        };
    }
    return function (target, value) {
        Object.defineProperty(target, "default", { enumerable: true, value: value });
    };
})();
/**
 * Module-interop helper: returns a namespace-style object for `mod`.
 * - A truthy `mod` flagged with `__esModule` is returned untouched.
 * - Otherwise a fresh object is built: every own enumerable key of `mod` except `"default"`
 *   is bound onto it through `__createBinding`, and `mod` itself becomes its `default`
 *   member through `__setModuleDefault`.
 * An already-installed `this.__importStar` is reused when available.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        var hasOwn = Object.prototype.hasOwnProperty;
        for (var key in mod) {
            // Skip "default" (set separately below) and anything inherited.
            if (key === "default" || !hasOwn.call(mod, key)) {
                continue;
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
/**
 * Module-interop helper: guarantees the result has a `default` member.
 * A truthy `mod` flagged with `__esModule` is returned as-is; anything else is wrapped as `{ default: mod }`.
 * An already-installed `this.__importDefault` is reused when available.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getEncoding = exports.isBinary = exports.isText = void 0;
const pathUtil = __importStar(require("path"));
const textextensions_1 = __importDefault(require("textextensions"));
const binaryextensions_1 = __importDefault(require("binaryextensions"));
/**
 * Decide whether a file should be treated as text.
 * When a filename is given, the dot-separated segments of its basename are checked from last to first
 * against the text extension list and then the binary extension list; the first segment found in either list decides.
 * If that is inconclusive (or no filename was given) and a buffer is available, the result is whether
 * `getEncoding(buffer)` reports `'utf8'`.
 * Extensions are checked first because they are cheaper than content inspection, and because content inspection
 * cannot guarantee accuracy for chars between utf8 and utf16.
 * The extension lists come from https://github.com/bevry/textextensions and https://github.com/bevry/binaryextensions
 * @param filename The filename for the file/buffer if available
 * @param buffer The buffer for the file if available
 * @returns `true` for text, `false` for binary, or `null` when the extension check was inconclusive (or skipped) and no `buffer` was provided.
 */
function isText(filename, buffer) {
    if (filename) {
        const segments = pathUtil.basename(filename).split('.');
        // Walk backwards so the outermost extension is considered first.
        for (let i = segments.length - 1; i >= 0; --i) {
            const segment = segments[i];
            if (textextensions_1.default.indexOf(segment) !== -1) {
                return true;
            }
            if (binaryextensions_1.default.indexOf(segment) !== -1) {
                return false;
            }
        }
    }
    // Extensions did not settle it: inspect the content if we have any, otherwise give up.
    return buffer ? getEncoding(buffer) === 'utf8' : null;
}
exports.isText = isText;
/**
 * Decide whether a file should be treated as binary.
 * This is the logical inverse of `isText(filename, buffer)`: the same extension-first,
 * content-second detection is used, and an undetermined (`null`/`undefined`) result is passed through as `null`.
 * @param filename The filename for the file/buffer if available
 * @param buffer The buffer for the file if available
 * @returns `null` when `isText` could not decide, otherwise the negation of its result.
 */
function isBinary(filename, buffer) {
    const textVerdict = isText(filename, buffer);
    return textVerdict == null ? null : !textVerdict;
}
exports.isBinary = isBinary;
/**
 * Classify a buffer as `'utf8'` text or `'binary'`.
 * Without `opts.chunkBegin`, three chunks are sampled in turn: the start, the middle, and the end of the buffer;
 * the first chunk that is not text decides the result. History has shown that inspection at all three locations is necessary.
 * With `opts.chunkBegin`, only the chunk at that offset is inspected: its boundaries are widened so multibyte
 * UTF-8 sequences are not cut, it is decoded as UTF-8, and it is reported as binary when it contains
 * the replacement character (65533) or a control character with code 8 or below.
 * @param buffer The buffer to inspect
 * @param opts Optional settings
 * @param opts.chunkLength Number of bytes per inspected chunk, defaults to 24
 * @param opts.chunkBegin Byte offset of the single chunk to inspect; when omitted, start/middle/end are sampled
 * @returns Will be `null` if `buffer` was not provided. Otherwise will be either `'utf8'` or `'binary'`
 */
function getEncoding(buffer, opts) {
    if (!buffer) {
        return null;
    }
    const textEncoding = 'utf8';
    const binaryEncoding = 'binary';
    const hasOpts = opts !== null && opts !== undefined;
    const chunkLength = hasOpts && opts.chunkLength != null ? opts.chunkLength : 24;
    const requestedBegin = hasOpts ? opts.chunkBegin : undefined;

    if (requestedBegin != null) {
        // Single-chunk mode: align both edges to UTF-8 character boundaries first.
        const alignedBegin = getChunkBegin(buffer, requestedBegin);
        if (alignedBegin === -1) {
            return binaryEncoding;
        }
        const alignedEnd = getChunkEnd(buffer, Math.min(buffer.length, alignedBegin + chunkLength));
        if (alignedEnd > buffer.length) {
            // The last character of the chunk would run past the end of the buffer.
            return binaryEncoding;
        }
        const decoded = buffer.toString(textEncoding, alignedBegin, alignedEnd);
        // 8 and below are control characters (e.g. backspace, null, eof, etc.)
        // 65533 (U+FFFD) is the unknown character
        const suspiciousCodeUnit = /[\u0000-\u0008\uFFFD]/;
        return suspiciousCodeUnit.test(decoded) ? binaryEncoding : textEncoding;
    }

    // Sampling mode: start, middle, end. Stop at the first chunk that is not text.
    const sampleOffsets = [
        0,
        Math.max(0, Math.floor(buffer.length / 2) - chunkLength),
        Math.max(0, buffer.length - chunkLength),
    ];
    for (const chunkBegin of sampleOffsets) {
        const verdict = getEncoding(buffer, { chunkLength, chunkBegin });
        if (verdict !== textEncoding) {
            return verdict;
        }
    }
    return textEncoding;
}
exports.getEncoding = getEncoding;
// ====================================
// The functions below are created to handle multibyte utf8 characters.
// To understand how the encoding works, check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding
// @todo add documentation for these
/**
 * Move a chunk start backwards so it does not fall inside a multibyte UTF-8 character.
 * If `chunkBegin` is 0 or the byte there is not a continuation byte (10xxxxxx), it is returned unchanged.
 * Otherwise the up-to-three preceding bytes are examined, farthest first, for a lead byte whose
 * sequence is long enough to cover `chunkBegin`:
 *   3 bytes back: 4-byte lead; 2 bytes back: 4- or 3-byte lead; 1 byte back: 4-, 3- or 2-byte lead.
 * @param buf The bytes being inspected
 * @param chunkBegin The proposed start offset
 * @returns The offset of the covering lead byte, the unchanged offset, or -1 if no covering lead byte was found
 */
function getChunkBegin(buf, chunkBegin) {
    if (chunkBegin === 0 || !isLaterByteOfUtf8(buf[chunkBegin])) {
        return chunkBegin;
    }
    for (let distance = 3; distance >= 1; --distance) {
        const candidate = chunkBegin - distance;
        if (candidate < 0) {
            continue;
        }
        const lead = buf[candidate];
        const coversChunkBegin = isFirstByteOf4ByteChar(lead) ||
            (distance <= 2 && isFirstByteOf3ByteChar(lead)) ||
            (distance === 1 && isFirstByteOf2ByteChar(lead));
        if (coversChunkBegin) {
            return candidate;
        }
    }
    return -1;
}
/**
 * Move an (exclusive) chunk end forwards so it does not cut a multibyte UTF-8 character.
 * If `chunkEnd` equals `buf.length` it is returned unchanged.
 * Otherwise the up-to-three bytes before `chunkEnd` are examined, farthest first, for a lead byte whose
 * sequence extends past `chunkEnd`; the end is then advanced by the number of missing bytes:
 *   3 bytes back: 4-byte lead (+1); 2 bytes back: 4-byte (+2) or 3-byte (+1) lead;
 *   1 byte back: 4-byte (+3), 3-byte (+2) or 2-byte (+1) lead.
 * The result may exceed `buf.length` when the final sequence is truncated; the caller is expected to check.
 * @param buf The bytes being inspected
 * @param chunkEnd The proposed exclusive end offset
 * @returns The adjusted exclusive end offset
 */
function getChunkEnd(buf, chunkEnd) {
    if (chunkEnd === buf.length) {
        return chunkEnd;
    }
    for (let distance = 3; distance >= 1; --distance) {
        const candidate = chunkEnd - distance;
        if (candidate < 0) {
            continue;
        }
        const lead = buf[candidate];
        let sequenceLength = 0;
        if (isFirstByteOf4ByteChar(lead)) {
            sequenceLength = 4;
        }
        else if (distance <= 2 && isFirstByteOf3ByteChar(lead)) {
            sequenceLength = 3;
        }
        else if (distance === 1 && isFirstByteOf2ByteChar(lead)) {
            sequenceLength = 2;
        }
        if (sequenceLength !== 0) {
            // Bytes of the sequence that still lie beyond the proposed end.
            return chunkEnd + (sequenceLength - distance);
        }
    }
    return chunkEnd;
}
/**
 * Whether `byte` is the lead byte of a 4-byte UTF-8 sequence, i.e. has the bit pattern 11110xxx.
 */
function isFirstByteOf4ByteChar(byte) {
    // Clear the three payload bits and compare the remaining prefix.
    // eslint-disable-next-line no-bitwise
    return (byte & ~0x07) === 0xf0;
}
/**
 * Whether `byte` is the lead byte of a 3-byte UTF-8 sequence, i.e. has the bit pattern 1110xxxx.
 */
function isFirstByteOf3ByteChar(byte) {
    // Clear the four payload bits and compare the remaining prefix.
    // eslint-disable-next-line no-bitwise
    return (byte & ~0x0f) === 0xe0;
}
/**
 * Whether `byte` is the lead byte of a 2-byte UTF-8 sequence, i.e. has the bit pattern 110xxxxx.
 */
function isFirstByteOf2ByteChar(byte) {
    // Clear the five payload bits and compare the remaining prefix.
    // eslint-disable-next-line no-bitwise
    return (byte & ~0x1f) === 0xc0;
}
/**
 * Whether `byte` is a continuation (non-lead) byte of a multibyte UTF-8 sequence, i.e. has the bit pattern 10xxxxxx.
 */
function isLaterByteOfUtf8(byte) {
    // Clear the six payload bits and compare the remaining prefix.
    // eslint-disable-next-line no-bitwise
    return (byte & ~0x3f) === 0x80;
}