UNPKG

asdfjkl

Version:

Determines if text contains gibberish.

86 lines (85 loc) 4.36 kB
"use strict";

/**
 * Gibberish detection driven by a character-bigram transition model.
 *
 * Text is reduced to the characters listed in `acceptedChars`; each adjacent
 * pair of characters is looked up in a matrix of log-probabilities and the
 * geometric mean of those transition probabilities is compared against a
 * stored threshold.
 */

var lodash = unwrapDefault(require("lodash"));
var fileSystem = unwrapDefault(require("fs"));
var model = unwrapDefault(require("./model.json"));

var encoding = 'utf8';
var defaultTrainingFileName = './data/big.txt';
var modelFileName = './lib/model.json';
var acceptedChars = 'abcdefghijklmnopqrstuvwxyz ';

// Lookup table: accepted character -> its row/column index in the matrix.
var pos = buildPositions(acceptedChars);

/**
 * Returns the module's default export when it is flagged as a transpiled ES
 * module, otherwise the module object itself.
 *
 * @param {*} mod - Value returned by `require`.
 * @returns {*} The usable module value.
 */
function unwrapDefault(mod) {
  if (mod && mod.__esModule) {
    return mod["default"];
  }
  return mod;
}

/**
 * Maps every character of `alphabet` to its index within the string.
 *
 * @param {string} alphabet - Characters to index.
 * @returns {Object} Plain object keyed by character.
 */
function buildPositions(alphabet) {
  var table = {};
  for (var index = 0; index < alphabet.length; index += 1) {
    table[alphabet.charAt(index)] = index;
  }
  return table;
}

/**
 * Lower-cases a line character by character and keeps only the characters
 * found in `acceptedChars`.
 *
 * @param {string} line - Raw text.
 * @returns {string[]} Accepted, lower-cased characters in their original order.
 */
function normalize(line) {
  var rawChars = line.split('');
  var kept = [];
  for (var i = 0; i < rawChars.length; i += 1) {
    var lowered = rawChars[i].toLowerCase();
    if (acceptedChars.indexOf(lowered) > -1) {
      kept.push(lowered);
    }
  }
  return kept;
}

/**
 * Slides a window of `n` characters over the normalized line and invokes
 * `iteratee` once per window, passing the window's characters as arguments.
 *
 * @param {number} n - Window size.
 * @param {string} line - Raw text.
 * @param {Function} iteratee - Called with `n` single-character arguments.
 */
function ngram(n, line, iteratee) {
  var chars = normalize(line);
  var lastStart = chars.length - n;
  for (var start = 0; start <= lastStart; start += 1) {
    iteratee.apply(void 0, chars.slice(start, start + n));
  }
}

/**
 * Reads a file and returns its non-empty lines (split on '\n').
 *
 * @param {string} fileName - Path handed to `fs.readFileSync`.
 * @returns {string[]} Lines with empty entries removed.
 */
function readLines(fileName) {
  var text = fileSystem.readFileSync(fileName, encoding);
  return text.split('\n').filter(function (entry) {
    return entry !== '';
  });
}

/**
 * Scores every non-empty line of a file against a log-probability matrix.
 *
 * @param {string} fileName - File whose lines are scored.
 * @param {number[][]} matrix - Log-probability matrix.
 * @returns {number[]} One score per line, in file order.
 */
function scoreLines(fileName, matrix) {
  var lines = readLines(fileName);
  var scores = [];
  for (var i = 0; i < lines.length; i += 1) {
    scores.push(averageTransitionProbability(lines[i], matrix));
  }
  return scores;
}

/**
 * Computes the geometric mean of the transition probabilities of all adjacent
 * accepted-character pairs in `line`.
 *
 * @param {string} line - Text to score.
 * @param {number[][]} probabilityMatrix - Log-probabilities indexed by
 *   `[pos[from]][pos[to]]`.
 * @returns {number} `exp(sum of logs / pair count)`; 1 when the line has no
 *   pairs (the sum is 0 and the divisor falls back to 1).
 */
function averageTransitionProbability(line, probabilityMatrix) {
  var logSum = 0;
  var pairCount = 0;
  ngram(2, line, function (from, to) {
    var row = probabilityMatrix[pos[from]];
    logSum += row[pos[to]];
    pairCount += 1;
  });
  var divisor = pairCount === 0 ? 1 : pairCount;
  return Math.exp(logSum / divisor);
}

/**
 * Builds the transition model from a training text and writes it, together
 * with a decision threshold, to `modelFileName` as JSON.
 *
 * Accepts one optional argument: the training file name, defaulting to
 * `defaultTrainingFileName` when omitted or `undefined`. It is read from
 * `arguments` so the function keeps a declared arity of zero.
 */
function train() {
  var trainingFileName = arguments[0] === undefined ? defaultTrainingFileName : arguments[0];
  var size = acceptedChars.length;

  // Every transition count starts at 10, so pairs never seen in training
  // still end up with a finite log-probability.
  var matrix = [];
  for (var r = 0; r < size; r += 1) {
    var counts = [];
    for (var c = 0; c < size; c += 1) {
      counts.push(10);
    }
    matrix.push(counts);
  }

  var lines = readLines(trainingFileName);
  for (var n = 0; n < lines.length; n += 1) {
    ngram(2, lines[n], function (from, to) {
      matrix[pos[from]][pos[to]] += 1;
    });
  }

  // Convert each row of counts into log-probabilities. The row total is taken
  // before any cell of that row is overwritten.
  for (var i = 0; i < matrix.length; i += 1) {
    var row = matrix[i];
    var rowSum = lodash.sum(row);
    for (var j = 0; j < row.length; j += 1) {
      row[j] = Math.log(row[j] / rowSum);
    }
  }

  var good = scoreLines('./data/good.txt', matrix);
  var bad = scoreLines('./data/bad.txt', matrix);
  var lowestGood = lodash.min(good);
  var highestBad = lodash.max(bad);

  // Sanity check: every good sample should outscore every bad sample.
  console.assert(lowestGood > highestBad, 'Good Model');

  // The threshold sits halfway between the two groups.
  var threshold = (lowestGood + highestBad) / 2;
  var content = JSON.stringify({
    matrix: matrix,
    threshold: threshold
  });
  fileSystem.writeFileSync(modelFileName, content, encoding);
}

/**
 * Default export: reports whether `text` scores at or below the stored
 * model's threshold.
 *
 * @param {string} text - Text to classify.
 * @returns {boolean} `true` when the score does not exceed the threshold.
 */
function _default(text) {
  var score = averageTransitionProbability(text, model.matrix);
  return score <= model.threshold;
}

Object.defineProperty(exports, "__esModule", {
  value: true
});
exports.averageTransitionProbability = averageTransitionProbability;
exports["default"] = _default;
exports.train = train;