detect-file-encoding-and-language
Version:
Charset Detector - Detect the encoding and language of text files - Use it in the browser, with Node.js, or via CLI
84 lines (65 loc) • 2.25 kB
JavaScript
const checkUTF = require("./components/checkUTF.js");
const processContent = require("./components/processContent.js");
const checkByteOrderMark = require("./components/checkByteOrderMark.js");
module.exports = (file) => {
return new Promise((resolve, reject) => {
const fileInfo = {
encoding: null,
language: null,
confidence: {
encoding: null,
language: null,
},
};
const data = {};
// Check the byte order mark!
const byteOrderMarkBuffer = new FileReader();
byteOrderMarkBuffer.onload = () => {
const uInt8String = new Uint8Array(byteOrderMarkBuffer.result).slice(0, 4).join(" ");
const byteOrderMark = checkByteOrderMark(uInt8String);
if (byteOrderMark) {
fileInfo.encoding = byteOrderMark;
fileInfo.confidence.encoding = 1;
const byteOrderMarkReader = new FileReader();
byteOrderMarkReader.onload = () => {
data.content = byteOrderMarkReader.result;
resolve(processContent(data, fileInfo));
};
byteOrderMarkReader.onerror = (err) => {
reject(err);
};
byteOrderMarkReader.readAsText(file, fileInfo.encoding);
} else {
// Read with UTF-8 first, then with ISO-8859-1
const utfReader = new FileReader();
utfReader.onload = () => {
const utfContent = utfReader.result;
const utf8 = checkUTF(utfContent);
if (utf8) {
fileInfo.encoding = "UTF-8";
fileInfo.confidence.encoding = 1;
}
if (utf8) {
data.content = utfContent;
resolve(processContent(data, fileInfo));
} else {
const isoReader = new FileReader();
isoReader.onload = () => {
data.content = isoReader.result;
resolve(processContent(data, fileInfo));
};
isoReader.readAsText(file, "ISO-8859-1");
}
};
utfReader.onerror = (err) => {
reject(err);
};
utfReader.readAsText(file, "UTF-8");
}
};
byteOrderMarkBuffer.onerror = (err) => {
reject(err);
};
byteOrderMarkBuffer.readAsArrayBuffer(file);
});
};