string-punctuation-tokenizer
Version:
Small library that provides functions to tokenize a string into an array of words, with or without punctuation.
72 lines (61 loc) • 6.99 kB
JavaScript
// Define `__esModule: true` on the CommonJS exports object (the flag that
// ES-module interop helpers conventionally look for on transpiled modules).
Object.defineProperty(exports, "__esModule", {
value: true
});
// Pre-declare every named export as `undefined`; each one is assigned its real
// value further down, right after the corresponding definition.
exports.normalizer = exports.normalizerDestructive = exports.normalizationsDestructive = exports._spaceNonSemanticGlyphs = exports._greekNonSemanticGlyphs = exports._hebrewNonSemanticGlyphs = void 0;
// Constants:
//
// Hebrew glyphs removed by destructive normalization:
//   U+0591-U+05AF  accents (cantillation marks)
//   U+05BD         meteg
//   U+05C0         paseq
//   U+05C3-U+05C5  sof pasuq, upper dot, lower dot
//   U+2060         word joiner
//
// NOTE: in UHB, maqqef (U+05BE) is followed by the word joiner (U+2060).
// Maqqef itself is deliberately NOT stripped, so the result still matches
// tokenization, which splits on nonword characters.
var _hebrewNonSemanticGlyphs = [
  {
    inputs: [/[\u0591-\u05AF\u05BD\u05C0\u05C3-\u05C5\u2060]/gi],
    output: ''
  }
]; // https://unicode-table.com/en/#combining-diacritical-marks
exports._hebrewNonSemanticGlyphs = _hebrewNonSemanticGlyphs;
// Greek glyphs removed by destructive normalization:
//   U+0300-U+0362  combining diacritical marks (accents, breathings, ...)
//   U+0374-U+0375  numeral sign, lower numeral sign
//   U+037A         ypogegrammeni
//   U+0384-U+0385  tonos, dialytika tonos
//   U+0387         ano teleia
var _greekNonSemanticGlyphs = [
  {
    inputs: [/[\u0300-\u0362\u0374-\u0375\u037a\u0384-\u0385\u0387]/g],
    output: ''
  }
];
exports._greekNonSemanticGlyphs = _greekNonSemanticGlyphs;
// Whitespace clean-up rules, applied in this order.
var _spaceNonSemanticGlyphs = [
  // 1. Remove zero-width spaces (U+200B) entirely.
  { inputs: [/(\u200B)/g], output: '' },
  // 2. Collapse every run of whitespace into a single space.
  { inputs: [/\s+/g], output: ' ' }
];
exports._spaceNonSemanticGlyphs = _spaceNonSemanticGlyphs;
// Default rule list used by normalizerDestructive: Hebrew rules first, then
// Greek, then whitespace. `concat` returns a new array, so the three source
// lists themselves are left untouched.
var normalizationsDestructive = _hebrewNonSemanticGlyphs.concat(
  _greekNonSemanticGlyphs,
  _spaceNonSemanticGlyphs
);
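/**
* Documents `normalizerDestructive`, which is defined right after the export statement below.
* Applies standard normalization to the string, then performs every replacement in `normalizations`.
* @param {String} string - The string to normalize
* @param {[{inputs:[RegExp], output:String}]} normalizations - Normalization Objects to perform the replace with
* @return {String} - The normalized string
*/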
exports.normalizationsDestructive = normalizationsDestructive;
/**
 * Normalize a string with `normalizer` (default form), then apply every
 * normalization rule in order: each match of each pattern in a rule's `inputs`
 * is replaced with that rule's `output`.
 *
 * @param {String} string - The string to normalize; `null` and `undefined`
 *   both produce an empty string.
 * @param {[{inputs:[RegExp], output:String}]} normalizations - Normalization
 *   Objects to perform the replace with. Defaults to `normalizationsDestructive`
 *   when omitted or `undefined`; any other falsy value skips the replacements.
 * @return {String} - The normalized string
 */
var normalizerDestructive = function normalizerDestructive(string, normalizations) {
  var rules = normalizations === undefined ? normalizationsDestructive : normalizations;

  // `normalizer` only falls back to '' for `undefined`, so map `null` onto
  // `undefined` here instead of letting it throw a TypeError.
  var result = normalizer(string === null ? undefined : string);

  // Nothing to replace for empty input or when the rules were explicitly disabled.
  if (!string || !rules) {
    return result;
  }

  rules.forEach(function (rule) {
    var output = rule.output;
    rule.inputs.forEach(function (input) {
      result = result.replace(input, output);
    });
  });

  return result;
};
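/**
* Documents `normalizer`, which is defined right after the export statement below.
* Normalize a string: standard JavaScript normalization, providing default form.
* @param {String} text - The string to normalize (defaults to '')
* @param {String} form - Unicode normalization form (defaults to 'NFKD')
* @return {String} normalized string.
*/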
exports.normalizerDestructive = normalizerDestructive;
/**
 * Normalize a string with the standard `String.prototype.normalize`, supplying
 * a default normalization form.
 *
 * `null` is treated like `undefined` for both arguments, so a missing value
 * falls back to the default instead of throwing.
 *
 * @param {String} [text=''] - The string to normalize.
 * @param {String} [form='NFKD'] - Unicode normalization form: 'NFC', 'NFD',
 *   'NFKC' or 'NFKD'.
 * @return {String} normalized string.
 * @throws {RangeError} if `form` is not a valid normalization form.
 * @throws {TypeError} if `text` is a non-nullish value without a `normalize` method.
 */
var normalizer = function normalizer(text, form) {
  var source = text === undefined || text === null ? '' : text;
  var targetForm = form === undefined || form === null ? 'NFKD' : form;
  return source.normalize(targetForm);
};
exports.normalizer = normalizer;
//# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjozLCJzb3VyY2VzIjpbIi4uL3NyYy9ub3JtYWxpemVycy5qcyJdLCJuYW1lcyI6WyJfaGVicmV3Tm9uU2VtYW50aWNHbHlwaHMiLCJpbnB1dHMiLCJvdXRwdXQiLCJfZ3JlZWtOb25TZW1hbnRpY0dseXBocyIsIl9zcGFjZU5vblNlbWFudGljR2x5cGhzIiwibm9ybWFsaXphdGlvbnNEZXN0cnVjdGl2ZSIsImNvbmNhdCIsIm5vcm1hbGl6ZXJEZXN0cnVjdGl2ZSIsInN0cmluZyIsIm5vcm1hbGl6YXRpb25zIiwiX3N0cmluZyIsIm5vcm1hbGl6ZXIiLCJzbGljZSIsImZvckVhY2giLCJpbnB1dCIsInJlcGxhY2UiLCJ0ZXh0IiwiZm9ybSIsIm5vcm1hbGl6ZSJdLCJtYXBwaW5ncyI6Ijs7Ozs7O0FBQUE7QUFDQTtBQUNBO0FBQ08sSUFBTUEsd0JBQXdCLEdBQUcsQ0FDcEM7QUFBRUMsRUFBQUEsTUFBTSxFQUFFLENBQUMsa0RBQUQsQ0FBVjtBQUFnRUMsRUFBQUEsTUFBTSxFQUFFO0FBQXhFLENBRG9DLENBQWpDLEMsQ0FHUDs7O0FBQ08sSUFBTUMsdUJBQXVCLEdBQUcsQ0FDbkM7QUFBRUYsRUFBQUEsTUFBTSxFQUFFLENBQUMsd0RBQUQsQ0FBVjtBQUFzRUMsRUFBQUEsTUFBTSxFQUFFO0FBQTlFLENBRG1DLENBQWhDOztBQUdBLElBQU1FLHVCQUF1QixHQUFHLENBQ25DO0FBQUVILEVBQUFBLE1BQU0sRUFBRSxDQUFDLFdBQUQsQ0FBVjtBQUF5QkMsRUFBQUEsTUFBTSxFQUFFO0FBQWpDLENBRG1DLEVBRW5DO0FBQUVELEVBQUFBLE1BQU0sRUFBRSxDQUFDLE1BQUQsQ0FBVjtBQUFvQkMsRUFBQUEsTUFBTSxFQUFFO0FBQTVCLENBRm1DLENBQWhDOztBQU1BLElBQU1HLHlCQUF5QixHQUFHLEdBQ3BDQyxNQURvQyxDQUM3Qk4sd0JBRDZCLEVBRXBDTSxNQUZvQyxDQUU3QkgsdUJBRjZCLEVBR3BDRyxNQUhvQyxDQUc3QkYsdUJBSDZCLENBQWxDO0FBTVA7Ozs7Ozs7O0FBS08sSUFBTUcscUJBQXFCLEdBQUcsU0FBeEJBLHFCQUF3QixDQUFDQyxNQUFELEVBQXdEO0FBQUEsTUFBL0NDLGNBQStDLHVFQUE5QkoseUJBQThCOztBQUN6RixNQUFJSyxPQUFPLEdBQUdDLFVBQVUsQ0FBQ0gsTUFBRCxDQUF4Qjs7QUFFQSxNQUFJQSxNQUFNLElBQUlDLGNBQWQsRUFBOEI7QUFDMUJDLElBQUFBLE9BQU8sR0FBR0EsT0FBTyxDQUFDRSxLQUFSLENBQWMsQ0FBZCxDQUFWO0FBQ0FILElBQUFBLGNBQWMsQ0FBQ0ksT0FBZixDQUF1QixnQkFBd0I7QUFBQSxVQUFyQlosTUFBcUIsUUFBckJBLE1BQXFCO0FBQUEsVUFBYkMsTUFBYSxRQUFiQSxNQUFhO0FBQzNDRCxNQUFBQSxNQUFNLENBQUNZLE9BQVAsQ0FBZSxVQUFDQyxLQUFELEVBQVc7QUFDdEJKLFFBQUFBLE9BQU8sR0FBR0EsT0FBTyxDQUFDSyxPQUFSLENBQWdCRCxLQUFoQixFQUF1QlosTUFBdkIsQ0FBVjtBQUNILE9BRkQ7QUFHSCxLQUpEO0FBS0g7O0FBRUQsU0FBT1EsT0FBUDtBQUNILENBYk07QUFlUDs7Ozs7Ozs7QUFJTyxJQUFNQyxVQUFVLEdBQUcsU0FBYkEsVUFBYSxHQUE4QjtBQUFBLE1BQTdCSyxJ
QUE2Qix1RUFBdEIsRUFBc0I7QUFBQSxNQUFsQkMsSUFBa0IsdUVBQVgsTUFBVztBQUNwREQsRUFBQUEsSUFBSSxHQUFHQSxJQUFJLENBQUNFLFNBQUwsQ0FBZUQsSUFBZixDQUFQO0FBQ0EsU0FBT0QsSUFBUDtBQUNILENBSE0iLCJzb3VyY2VzQ29udGVudCI6WyIvLyBDb25zdGFudHM6XG4vLyBOT1RFOiBpbiBVSEIsIG1hcXFlZiAwNUJFIGlzIGZvbGxvd2VkIGJ5IDIwNjAsIHdvcmQgam9pbmVyLlxuLy8gVGhlcmVmb3JlLCB3ZSBzaG91bGQgTk9UIHN0cmlwIG1hcXFlZiB0byBtYXRjaCB0b2tlbml6YXRpb24sIHdoaWNoIHNwbGl0cyBvbiBub253b3JkIGNoYXJhY3RlcnMuXG5leHBvcnQgY29uc3QgX2hlYnJld05vblNlbWFudGljR2x5cGhzID0gW1xuICAgIHsgaW5wdXRzOiBbL1tcXHUwNTkxLVxcdTA1QUZcXHUwNUJEXFx1MDVDMFxcdTA1QzMtXFx1MDVDNVxcdTIwNjBdL2dpXSwgb3V0cHV0OiAnJyB9LFxuXTtcbi8vIGh0dHBzOi8vdW5pY29kZS10YWJsZS5jb20vZW4vI2NvbWJpbmluZy1kaWFjcml0aWNhbC1tYXJrc1xuZXhwb3J0IGNvbnN0IF9ncmVla05vblNlbWFudGljR2x5cGhzID0gW1xuICAgIHsgaW5wdXRzOiBbL1tcXHUwMzAwLVxcdTAzNjJcXHUwMzc0LVxcdTAzNzVcXHUwMzdhXFx1MDM4NC1cXHUwMzg1XFx1MDM4N10vZ10sIG91dHB1dDogJycgfSxcbl07XG5leHBvcnQgY29uc3QgX3NwYWNlTm9uU2VtYW50aWNHbHlwaHMgPSBbXG4gICAgeyBpbnB1dHM6IFsvKFxcdTIwMEIpL2ddLCBvdXRwdXQ6ICcnIH0sXG4gICAgeyBpbnB1dHM6IFsvXFxzKy9nXSwgb3V0cHV0OiAnICcgfSxcbl07XG5cblxuZXhwb3J0IGNvbnN0IG5vcm1hbGl6YXRpb25zRGVzdHJ1Y3RpdmUgPSBbXVxuICAgIC5jb25jYXQoX2hlYnJld05vblNlbWFudGljR2x5cGhzKVxuICAgIC5jb25jYXQoX2dyZWVrTm9uU2VtYW50aWNHbHlwaHMpXG4gICAgLmNvbmNhdChfc3BhY2VOb25TZW1hbnRpY0dseXBocyk7XG5cblxuLyoqXG4gKiBAcGFyYW0ge1N0cmluZ30gc3RyaW5nIC0gVGhlIHN0cmluZyB0byBub3JtYWxpemVcbiAqIEBwYXJhbSB7W3tpbnB1dHM6W1JlZ0V4cF0sIG91dHB1dDpTdHJpbmd9XX0gbm9ybWFsaXphdGlvbnMgLSBOb3JtYWxpemF0aW9uIE9iamVjdHMgdG8gcGVyZm9ybSB0aGUgcmVwbGFjZSB3aXRoXG4gKiBAcmV0dXJuIHtTdHJpbmd9IC0gVGhlIG5vcm1hbGl6ZWQgc3RyaW5nXG4gKi9cbmV4cG9ydCBjb25zdCBub3JtYWxpemVyRGVzdHJ1Y3RpdmUgPSAoc3RyaW5nLCBub3JtYWxpemF0aW9ucyA9IG5vcm1hbGl6YXRpb25zRGVzdHJ1Y3RpdmUpID0+IHtcbiAgICBsZXQgX3N0cmluZyA9IG5vcm1hbGl6ZXIoc3RyaW5nKTtcblxuICAgIGlmIChzdHJpbmcgJiYgbm9ybWFsaXphdGlvbnMpIHtcbiAgICAgICAgX3N0cmluZyA9IF9zdHJpbmcuc2xpY2UoMCk7XG4gICAgICAgIG5vcm1hbGl6YXRpb25zLmZvckVhY2goKHsgaW5wdXRzLCBvdXRwdXQgfSkgPT4ge1xuICAgICAgICAgICAgaW5wdXRzLmZvckVhY2goKGlucHV0KSA9
PiB7XG4gICAgICAgICAgICAgICAgX3N0cmluZyA9IF9zdHJpbmcucmVwbGFjZShpbnB1dCwgb3V0cHV0KTtcbiAgICAgICAgICAgIH0pO1xuICAgICAgICB9KTtcbiAgICB9XG5cbiAgICByZXR1cm4gX3N0cmluZztcbn1cblxuLyoqXG4gKiBOb3JtYWxpemUgYSBzdHJpbmc6IHN0YW5kYXJkIEphdmFzY3JpcHQgbm9ybWFsaXphdGlvbiwgcHJvdmlkaW5nIGRlZmF1bHQgZm9ybS5cbiAqIEByZXR1cm4ge1N0cmluZ30gbm9ybWFsaXplZCBzdHJpbmcuXG4gKi9cbmV4cG9ydCBjb25zdCBub3JtYWxpemVyID0gKHRleHQgPSAnJywgZm9ybSA9ICdORktEJykgPT4ge1xuICAgIHRleHQgPSB0ZXh0Lm5vcm1hbGl6ZShmb3JtKTtcbiAgICByZXR1cm4gdGV4dDtcbn07XG4iXX0=
;