detect-file-encoding-and-language
Version:
Charset Detector - Detect the encoding and language of text files - Use it in the browser, with Node.js, or via CLI
44 lines (34 loc) • 1.36 kB
JavaScript
const countAllMatches = require("./processing-content/countAllMatches.js");
const calculateConfidenceScore = require("./processing-content/calculateConfidenceScore.js");
const byteOrderMarkObject = require("../config/byteOrderMarkObject.js");
module.exports = (data, fileInfo) => {
data.languageArr = countAllMatches(data, fileInfo.encoding);
fileInfo.language = data.languageArr.reduce((acc, val) =>
acc.count > val.count ? acc : val
).name;
// "pos" gives us the position in the language array that has the most matches
data.pos = data.languageArr.findIndex(
(elem) => elem.name === fileInfo.language
);
// Determine the encoding
if (!fileInfo.encoding) {
fileInfo.encoding = data.languageArr[data.pos].encoding;
}
const calculations = calculateConfidenceScore(data, fileInfo);
if (fileInfo.confidence.encoding) {
fileInfo.confidence.language = calculations;
} else {
fileInfo.confidence.encoding = calculations;
fileInfo.confidence.language = calculations;
}
// Edge case, when no matches were found
if (!data.languageArr[data.pos].count) {
fileInfo.language = null;
fileInfo.confidence.language = null;
if (!byteOrderMarkObject.some(obj => obj.encoding === fileInfo.encoding)) {
fileInfo.encoding = null;
fileInfo.confidence.encoding = null;
}
}
return fileInfo;
};