UNPKG

string-punctuation-tokenizer

Version:

Small library that provides functions to tokenize a string into an array of words with or without punctuation

72 lines (61 loc) 6.99 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.normalizer = exports.normalizerDestructive = exports.normalizationsDestructive = exports._spaceNonSemanticGlyphs = exports._greekNonSemanticGlyphs = exports._hebrewNonSemanticGlyphs = void 0; // Constants: // NOTE: in UHB, maqqef 05BE is followed by 2060, word joiner. // Therefore, we should NOT strip maqqef to match tokenization, which splits on nonword characters. var _hebrewNonSemanticGlyphs = [{ inputs: [/[\u0591-\u05AF\u05BD\u05C0\u05C3-\u05C5\u2060]/gi], output: '' }]; // https://unicode-table.com/en/#combining-diacritical-marks exports._hebrewNonSemanticGlyphs = _hebrewNonSemanticGlyphs; var _greekNonSemanticGlyphs = [{ inputs: [/[\u0300-\u0362\u0374-\u0375\u037a\u0384-\u0385\u0387]/g], output: '' }]; exports._greekNonSemanticGlyphs = _greekNonSemanticGlyphs; var _spaceNonSemanticGlyphs = [{ inputs: [/(\u200B)/g], output: '' }, { inputs: [/\s+/g], output: ' ' }]; exports._spaceNonSemanticGlyphs = _spaceNonSemanticGlyphs; var normalizationsDestructive = [].concat(_hebrewNonSemanticGlyphs).concat(_greekNonSemanticGlyphs).concat(_spaceNonSemanticGlyphs); /** * @param {String} string - The string to normalize * @param {[{inputs:[RegExp], output:String}]} normalizations - Normalization Objects to perform the replace with * @return {String} - The normalized string */ exports.normalizationsDestructive = normalizationsDestructive; var normalizerDestructive = function normalizerDestructive(string) { var normalizations = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : normalizationsDestructive; var _string = normalizer(string); if (string && normalizations) { _string = _string.slice(0); normalizations.forEach(function (_ref) { var inputs = _ref.inputs, output = _ref.output; inputs.forEach(function (input) { _string = _string.replace(input, output); }); }); } return _string; }; /** * Normalize a string: standard Javascript normalization, providing default form. * @return {String} normalized string. */ exports.normalizerDestructive = normalizerDestructive; var normalizer = function normalizer() { var text = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : ''; var form = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'NFKD'; text = text.normalize(form); return text; }; exports.normalizer = normalizer; //# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjozLCJzb3VyY2VzIjpbIi4uL3NyYy9ub3JtYWxpemVycy5qcyJdLCJuYW1lcyI6WyJfaGVicmV3Tm9uU2VtYW50aWNHbHlwaHMiLCJpbnB1dHMiLCJvdXRwdXQiLCJfZ3JlZWtOb25TZW1hbnRpY0dseXBocyIsIl9zcGFjZU5vblNlbWFudGljR2x5cGhzIiwibm9ybWFsaXphdGlvbnNEZXN0cnVjdGl2ZSIsImNvbmNhdCIsIm5vcm1hbGl6ZXJEZXN0cnVjdGl2ZSIsInN0cmluZyIsIm5vcm1hbGl6YXRpb25zIiwiX3N0cmluZyIsIm5vcm1hbGl6ZXIiLCJzbGljZSIsImZvckVhY2giLCJpbnB1dCIsInJlcGxhY2UiLCJ0ZXh0IiwiZm9ybSIsIm5vcm1hbGl6ZSJdLCJtYXBwaW5ncyI6Ijs7Ozs7O0FBQUE7QUFDQTtBQUNBO0FBQ08sSUFBTUEsd0JBQXdCLEdBQUcsQ0FDcEM7QUFBRUMsRUFBQUEsTUFBTSxFQUFFLENBQUMsa0RBQUQsQ0FBVjtBQUFnRUMsRUFBQUEsTUFBTSxFQUFFO0FBQXhFLENBRG9DLENBQWpDLEMsQ0FHUDs7O0FBQ08sSUFBTUMsdUJBQXVCLEdBQUcsQ0FDbkM7QUFBRUYsRUFBQUEsTUFBTSxFQUFFLENBQUMsd0RBQUQsQ0FBVjtBQUFzRUMsRUFBQUEsTUFBTSxFQUFFO0FBQTlFLENBRG1DLENBQWhDOztBQUdBLElBQU1FLHVCQUF1QixHQUFHLENBQ25DO0FBQUVILEVBQUFBLE1BQU0sRUFBRSxDQUFDLFdBQUQsQ0FBVjtBQUF5QkMsRUFBQUEsTUFBTSxFQUFFO0FBQWpDLENBRG1DLEVBRW5DO0FBQUVELEVBQUFBLE1BQU0sRUFBRSxDQUFDLE1BQUQsQ0FBVjtBQUFvQkMsRUFBQUEsTUFBTSxFQUFFO0FBQTVCLENBRm1DLENBQWhDOztBQU1BLElBQU1HLHlCQUF5QixHQUFHLEdBQ3BDQyxNQURvQyxDQUM3Qk4sd0JBRDZCLEVBRXBDTSxNQUZvQyxDQUU3QkgsdUJBRjZCLEVBR3BDRyxNQUhvQyxDQUc3QkYsdUJBSDZCLENBQWxDO0FBTVA7Ozs7Ozs7O0FBS08sSUFBTUcscUJBQXFCLEdBQUcsU0FBeEJBLHFCQUF3QixDQUFDQyxNQUFELEVBQXdEO0FBQUEsTUFBL0NDLGNBQStDLHVFQUE5QkoseUJBQThCOztBQUN6RixNQUFJSyxPQUFPLEdBQUdDLFVBQVUsQ0FBQ0gsTUFBRCxDQUF4Qjs7QUFFQSxNQUFJQSxNQUFNLElBQUlDLGNBQWQsRUFBOEI7QUFDMUJDLElBQUFBLE9BQU8sR0FBR0EsT0FBTyxDQUFDRSxLQUFSLENBQWMsQ0FBZCxDQUFWO0FBQ0FILElBQUFBLGNBQWMsQ0FBQ0ksT0FBZixDQUF1QixnQkFBd0I7QUFBQSxVQUFyQlosTUFBcUIsUUFBckJBLE1BQXFCO0FBQUEsVUFBYkMsTUFBYSxRQUFiQSxNQUFhO0FBQzNDRCxNQUFBQSxNQUFNLENBQUNZLE9BQVAsQ0FBZSxVQUFDQyxLQUFELEVBQVc7QUFDdEJKLFFBQUFBLE9BQU8sR0FBR0EsT0FBTyxDQUFDSyxPQUFSLENBQWdCRCxLQUFoQixFQUF1QlosTUFBdkIsQ0FBVjtBQUNILE9BRkQ7QUFHSCxLQUpEO0FBS0g7O0FBRUQsU0FBT1EsT0FBUDtBQUNILENBYk07QUFlUDs7Ozs7Ozs7QUFJTyxJQUFNQyxVQUFVLEdBQUcsU0FBYkEsVUFBYSxHQUE4QjtBQUFBLE1BQTdCSyxJQUE2Qix1RUFBdEIsRUFBc0I7QUFBQSxNQUFsQkMsSUFBa0IsdUVBQVgsTUFBVztBQUNwREQsRUFBQUEsSUFBSSxHQUFHQSxJQUFJLENBQUNFLFNBQUwsQ0FBZUQsSUFBZixDQUFQO0FBQ0EsU0FBT0QsSUFBUDtBQUNILENBSE0iLCJzb3VyY2VzQ29udGVudCI6WyIvLyBDb25zdGFudHM6XG4vLyBOT1RFOiBpbiBVSEIsIG1hcXFlZiAwNUJFIGlzIGZvbGxvd2VkIGJ5IDIwNjAsIHdvcmQgam9pbmVyLlxuLy8gVGhlcmVmb3JlLCB3ZSBzaG91bGQgTk9UIHN0cmlwIG1hcXFlZiB0byBtYXRjaCB0b2tlbml6YXRpb24sIHdoaWNoIHNwbGl0cyBvbiBub253b3JkIGNoYXJhY3RlcnMuXG5leHBvcnQgY29uc3QgX2hlYnJld05vblNlbWFudGljR2x5cGhzID0gW1xuICAgIHsgaW5wdXRzOiBbL1tcXHUwNTkxLVxcdTA1QUZcXHUwNUJEXFx1MDVDMFxcdTA1QzMtXFx1MDVDNVxcdTIwNjBdL2dpXSwgb3V0cHV0OiAnJyB9LFxuXTtcbi8vIGh0dHBzOi8vdW5pY29kZS10YWJsZS5jb20vZW4vI2NvbWJpbmluZy1kaWFjcml0aWNhbC1tYXJrc1xuZXhwb3J0IGNvbnN0IF9ncmVla05vblNlbWFudGljR2x5cGhzID0gW1xuICAgIHsgaW5wdXRzOiBbL1tcXHUwMzAwLVxcdTAzNjJcXHUwMzc0LVxcdTAzNzVcXHUwMzdhXFx1MDM4NC1cXHUwMzg1XFx1MDM4N10vZ10sIG91dHB1dDogJycgfSxcbl07XG5leHBvcnQgY29uc3QgX3NwYWNlTm9uU2VtYW50aWNHbHlwaHMgPSBbXG4gICAgeyBpbnB1dHM6IFsvKFxcdTIwMEIpL2ddLCBvdXRwdXQ6ICcnIH0sXG4gICAgeyBpbnB1dHM6IFsvXFxzKy9nXSwgb3V0cHV0OiAnICcgfSxcbl07XG5cblxuZXhwb3J0IGNvbnN0IG5vcm1hbGl6YXRpb25zRGVzdHJ1Y3RpdmUgPSBbXVxuICAgIC5jb25jYXQoX2hlYnJld05vblNlbWFudGljR2x5cGhzKVxuICAgIC5jb25jYXQoX2dyZWVrTm9uU2VtYW50aWNHbHlwaHMpXG4gICAgLmNvbmNhdChfc3BhY2VOb25TZW1hbnRpY0dseXBocyk7XG5cblxuLyoqXG4gKiBAcGFyYW0ge1N0cmluZ30gc3RyaW5nIC0gVGhlIHN0cmluZyB0byBub3JtYWxpemVcbiAqIEBwYXJhbSB7W3tpbnB1dHM6W1JlZ0V4cF0sIG91dHB1dDpTdHJpbmd9XX0gbm9ybWFsaXphdGlvbnMgLSBOb3JtYWxpemF0aW9uIE9iamVjdHMgdG8gcGVyZm9ybSB0aGUgcmVwbGFjZSB3aXRoXG4gKiBAcmV0dXJuIHtTdHJpbmd9IC0gVGhlIG5vcm1hbGl6ZWQgc3RyaW5nXG4gKi9cbmV4cG9ydCBjb25zdCBub3JtYWxpemVyRGVzdHJ1Y3RpdmUgPSAoc3RyaW5nLCBub3JtYWxpemF0aW9ucyA9IG5vcm1hbGl6YXRpb25zRGVzdHJ1Y3RpdmUpID0+IHtcbiAgICBsZXQgX3N0cmluZyA9IG5vcm1hbGl6ZXIoc3RyaW5nKTtcblxuICAgIGlmIChzdHJpbmcgJiYgbm9ybWFsaXphdGlvbnMpIHtcbiAgICAgICAgX3N0cmluZyA9IF9zdHJpbmcuc2xpY2UoMCk7XG4gICAgICAgIG5vcm1hbGl6YXRpb25zLmZvckVhY2goKHsgaW5wdXRzLCBvdXRwdXQgfSkgPT4ge1xuICAgICAgICAgICAgaW5wdXRzLmZvckVhY2goKGlucHV0KSA9PiB7XG4gICAgICAgICAgICAgICAgX3N0cmluZyA9IF9zdHJpbmcucmVwbGFjZShpbnB1dCwgb3V0cHV0KTtcbiAgICAgICAgICAgIH0pO1xuICAgICAgICB9KTtcbiAgICB9XG5cbiAgICByZXR1cm4gX3N0cmluZztcbn1cblxuLyoqXG4gKiBOb3JtYWxpemUgYSBzdHJpbmc6IHN0YW5kYXJkIEphdmFzY3JpcHQgbm9ybWFsaXphdGlvbiwgcHJvdmlkaW5nIGRlZmF1bHQgZm9ybS5cbiAqIEByZXR1cm4ge1N0cmluZ30gbm9ybWFsaXplZCBzdHJpbmcuXG4gKi9cbmV4cG9ydCBjb25zdCBub3JtYWxpemVyID0gKHRleHQgPSAnJywgZm9ybSA9ICdORktEJykgPT4ge1xuICAgIHRleHQgPSB0ZXh0Lm5vcm1hbGl6ZShmb3JtKTtcbiAgICByZXR1cm4gdGV4dDtcbn07XG4iXX0=