asdfjkl
Version:
Determines if text contains gibberish.
86 lines (85 loc) • 4.36 kB
JavaScript
;
// Flag this CommonJS module as a transpiled ES module; interop helpers such as
// _interopRequireDefault check `__esModule` and return the exports object unchanged when it is set.
Object.defineProperty(exports, "__esModule", {
value: true
});
// Public API. The three functions are function declarations further down the file,
// so hoisting makes them available here before their definitions appear.
exports.averageTransitionProbability = averageTransitionProbability;
exports["default"] = _default;
exports.train = train;
var _lodash = _interopRequireDefault(require("lodash"));
var _fs = _interopRequireDefault(require("fs"));
var _model = _interopRequireDefault(require("./model.json"));
/**
 * Makes a required module readable through a `default` property.
 *
 * @param {*} obj - Value returned by `require`.
 * @returns {*} `obj` itself when it is truthy and carries a truthy `__esModule`
 *   property; otherwise a new object of the form `{ default: obj }`.
 */
function _interopRequireDefault(obj) {
  var isTranspiledModule = obj && obj.__esModule;
  if (isTranspiledModule) {
    return obj;
  }
  return { "default": obj };
}
/**
 * Converts a spreadable value into a fresh array by trying each conversion
 * strategy in turn and keeping the first truthy result.
 *
 * @param {*} arr - Value being spread.
 * @returns {Array} Array produced by the first strategy that succeeds.
 * @throws {TypeError} Via _nonIterableSpread when no strategy yields a result.
 */
function _toConsumableArray(arr) {
  var result = _arrayWithoutHoles(arr);
  if (!result) {
    result = _iterableToArray(arr);
  }
  if (!result) {
    result = _unsupportedIterableToArray(arr);
  }
  if (!result) {
    // Always throws; the assignment only mirrors the fall-through chain.
    result = _nonIterableSpread();
  }
  return result;
}
/**
 * Signals that a value could not be spread into an array.
 *
 * @throws {TypeError} Always.
 */
function _nonIterableSpread() {
  var message = "Invalid attempt to spread non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method.";
  throw new TypeError(message);
}
/**
 * Fallback conversion for values handled neither by the plain-array path nor by
 * the iterator path: strings, Map/Set, `arguments` objects and integer typed arrays.
 *
 * @param {*} o - Value to convert.
 * @param {number} [minLen] - Forwarded to _arrayLikeToArray as the length to copy.
 * @returns {Array|undefined} The converted array, or `undefined` when `o` is falsy
 *   or of an unrecognised kind.
 */
function _unsupportedIterableToArray(o, minLen) {
  if (!o) {
    return undefined;
  }
  if (typeof o === "string") {
    return _arrayLikeToArray(o, minLen);
  }
  // "[object Xyz]" -> "Xyz"
  var tag = Object.prototype.toString.call(o).slice(8, -1);
  if (tag === "Object" && o.constructor) {
    // Generic objects are classified by their constructor's name instead.
    tag = o.constructor.name;
  }
  switch (tag) {
    case "Map":
    case "Set":
      return Array.from(o);
    case "Arguments":
      return _arrayLikeToArray(o, minLen);
    default:
      if (/^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(tag)) {
        return _arrayLikeToArray(o, minLen);
      }
      return undefined;
  }
}
/**
 * Converts a value to an array with `Array.from` when it exposes an iterator,
 * either under `Symbol.iterator` or under the legacy "@@iterator" key.
 *
 * @param {*} iter - Candidate iterable; property access throws for null/undefined.
 * @returns {Array|undefined} The resulting array, or `undefined` when no iterator is found.
 */
function _iterableToArray(iter) {
  var hasSymbolIterator = typeof Symbol !== "undefined" && iter[Symbol.iterator] != null;
  if (hasSymbolIterator || iter["@@iterator"] != null) {
    return Array.from(iter);
  }
  return undefined;
}
/**
 * Copies a real array through _arrayLikeToArray, which assigns every index and
 * therefore leaves no holes in the result.
 *
 * @param {*} arr - Candidate value.
 * @returns {Array|undefined} A copy when `arr` is an array, otherwise `undefined`.
 */
function _arrayWithoutHoles(arr) {
  return Array.isArray(arr) ? _arrayLikeToArray(arr) : undefined;
}
/**
 * Copies the leading elements of an array-like value into a new array.
 *
 * @param {ArrayLike} arr - Source with a `length` and indexed elements.
 * @param {number} [len] - Number of elements to copy; when null/undefined or
 *   larger than `arr.length`, `arr.length` is used instead.
 * @returns {Array} New array in which every index below the copied length is assigned.
 */
function _arrayLikeToArray(arr, len) {
  var count = len;
  if (count == null || count > arr.length) {
    count = arr.length;
  }
  var copy = new Array(count);
  var index = 0;
  while (index < count) {
    copy[index] = arr[index];
    index += 1;
  }
  return copy;
}
// Text encoding used for every file read (readLines) and for writing the model file (train).
var encoding = 'utf8';
// Corpus that train() reads when it is called without an argument.
var defaultTrainingFileName = './data/big.txt';
// Where train() writes the serialized model. This is a relative path handed to fs,
// so it is resolved against the process working directory, not this module's location.
var modelFileName = './lib/model.json';
// The alphabet of the model: normalize() drops every character not in this string,
// and its length sets the dimensions of the transition matrix built by train().
var acceptedChars = 'abcdefghijklmnopqrstuvwxyz ';
// Lookup table from each accepted character to its index in `acceptedChars`;
// these indices address the rows and columns of the transition matrix.
var pos = acceptedChars.split('').reduce(function (indexByChar, symbol, index) {
  indexByChar[symbol] = index;
  return indexByChar;
}, {});
/**
 * Lower-cases a line character by character and keeps only the characters that
 * occur in `acceptedChars`.
 *
 * @param {string} line - Raw text; `split` is called on it directly.
 * @returns {string[]} The retained lower-case characters, in their original order.
 */
var normalize = function normalize(line) {
  var kept = [];
  line.split('').forEach(function (rawChar) {
    var lowered = _lodash["default"].toLower(rawChar);
    if (_lodash["default"].includes(acceptedChars, lowered)) {
      kept.push(lowered);
    }
  });
  return kept;
};
/**
 * Slides a window of `n` characters over the normalized form of `line` and calls
 * `iteratee` once per window, passing the window's characters as separate arguments.
 *
 * @param {number} n - Window size.
 * @param {string} line - Raw text; it is normalized before windowing.
 * @param {Function} iteratee - Callback invoked with the characters of each window.
 * @returns {void}
 */
var ngram = function ngram(n, line, iteratee) {
  var chars = normalize(line);
  var lastStart = chars.length - n;
  var offset = 0;
  while (offset <= lastStart) {
    // slice already yields a fresh array, so it can be applied directly.
    var windowChars = chars.slice(offset, offset + n);
    iteratee.apply(void 0, windowChars);
    offset += 1;
  }
};
/**
 * Reads a text file synchronously and returns its non-empty lines.
 *
 * @param {string} fileName - Path of the file to read, decoded with `encoding`.
 * @returns {string[]} The file content split on '\n' with falsy (empty) entries removed.
 */
var readLines = function readLines(fileName) {
  var contents = _fs["default"].readFileSync(fileName, encoding);
  var rawLines = contents.split('\n');
  return _lodash["default"].compact(rawLines);
};
/**
 * Scores a line by averaging the matrix entries of its consecutive character
 * pairs and exponentiating that average.
 *
 * @param {string} line - Text to score; only accepted characters take part.
 * @param {number[][]} probabilityMatrix - Matrix indexed by `pos` of the first and
 *   second character of a pair; train() fills it with logarithms of probabilities.
 * @returns {number} `Math.exp` of the mean entry, or 1 when the line yields no pairs
 *   (the sum is 0 and the divisor falls back to 1).
 */
function averageTransitionProbability(line, probabilityMatrix) {
  var logSum = 0;
  var pairCount = 0;
  ngram(2, line, function (from, to) {
    var row = probabilityMatrix[pos[from]];
    logSum += row[pos[to]];
    pairCount += 1;
  });
  // Avoid dividing by zero for lines with fewer than two accepted characters.
  var divisor = pairCount === 0 ? 1 : pairCount;
  return Math.exp(logSum / divisor);
}
/**
 * Builds the character-transition model and writes it to `modelFileName` as JSON.
 *
 * Steps:
 *  1. Count every pair of consecutive accepted characters in the training file.
 *  2. Turn each row of counts into logarithms of that row's relative frequencies.
 *  3. Score the lines of './data/good.txt' and './data/bad.txt' with the matrix.
 *  4. Set the threshold to the midpoint between the lowest good score and the
 *     highest bad score.
 *  5. Serialize `{ matrix, threshold }` to `modelFileName`.
 *
 * @param {string} [trainingFileName=defaultTrainingFileName] - Corpus to learn from.
 * @returns {void}
 */
function train() {
  var trainingFileName = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : defaultTrainingFileName;
  var k = acceptedChars.length;
  // Every cell starts at 10 rather than 0, so no count is zero when the
  // logarithm is taken below, even for pairs that never occur in the corpus.
  var matrix = _toConsumableArray(Array(k)).map(function () {
    return _toConsumableArray(Array(k)).map(function () {
      return 10;
    });
  });
  var lines = readLines(trainingFileName);
  lines.forEach(function (line) {
    ngram(2, line, function (a, b) {
      matrix[pos[a]][pos[b]] += 1;
    });
  });
  // Replace each count with the log of its share of the row total.
  matrix.forEach(function (row, i) {
    var rowSum = _lodash["default"].sum(row);
    _lodash["default"].range(row.length).forEach(function (j) {
      matrix[i][j] = Math.log(row[j] / rowSum);
    });
  });
  var good = _lodash["default"].map(readLines('./data/good.txt'), function (line) {
    return averageTransitionProbability(line, matrix);
  });
  var bad = _lodash["default"].map(readLines('./data/bad.txt'), function (line) {
    return averageTransitionProbability(line, matrix);
  });
  // NOTE(review): if either sample file yields no lines, lodash min/max return
  // undefined and the threshold becomes NaN — confirm whether that should be an error.
  var lowestGood = _lodash["default"].min(good);
  var highestBad = _lodash["default"].max(bad);
  // console.assert prints its message only when the condition is falsy, so the
  // message must describe the failure, not the success case.
  console.assert(lowestGood > highestBad, 'Bad model: the lowest-scoring good sample does not score above the highest-scoring bad sample');
  var threshold = (lowestGood + highestBad) / 2;
  var content = JSON.stringify({
    matrix: matrix,
    threshold: threshold
  });
  _fs["default"].writeFileSync(modelFileName, content, encoding);
}
/**
 * Default export: decides whether `text` is gibberish according to the bundled model.
 *
 * @param {string} text - Text to classify.
 * @returns {boolean} `true` when the text's average transition probability is at
 *   or below the model's threshold, `false` otherwise.
 */
function _default(text) {
  var model = _model["default"];
  var score = averageTransitionProbability(text, model.matrix);
  return score <= model.threshold;
}