UNPKG

string-punctuation-tokenizer

Version:

Small library that provides functions to tokenize a string into an array of words with or without punctuation

239 lines (192 loc) 23.3 kB
"use strict";

Object.defineProperty(exports, "__esModule", {
  value: true
});
exports.classifyTokens = exports.tokenize = exports.tokenizeOrigLang = exports.number_ = exports.greedyNumber = exports.number = exports.whitespace = exports.punctuation = exports.origGreedyWord = exports.greedyWord = exports.origWord = exports.word = exports._greedyNumber = exports._origGreedyWord = exports._greedyWord = exports._origWordOrNumber = exports._wordOrNumber = exports._number = exports._origWord = exports._word = void 0;

var _xregexp = _interopRequireDefault(require("xregexp"));

var _occurrences2 = require("./occurrences");

var _normalizers = require("./normalizers");

// NOTE: this build carries no inline source map; regenerate one from
// ../src/tokenizers.js when rebuilding.

/**
 * Wrap a CommonJS export so it can be read through `.default` like an ES module.
 * @param {*} obj - value returned by `require`
 * @return {Object} - `obj` itself when it is flagged `__esModule`, else `{ default: obj }`
 */
function _interopRequireDefault(obj) {
  return obj && obj.__esModule ? obj : { "default": obj };
}

/**
 * List the own string keys of an object plus its own symbol keys.
 * @param {Object} object - object to inspect
 * @param {Boolean} enumerableOnly - when truthy, keep only enumerable symbols
 * @return {Array} - own keys followed by own symbols
 */
function ownKeys(object, enumerableOnly) {
  var keys = Object.keys(object);

  if (Object.getOwnPropertySymbols) {
    var symbols = Object.getOwnPropertySymbols(object);
    if (enumerableOnly) {
      symbols = symbols.filter(function (sym) {
        return Object.getOwnPropertyDescriptor(object, sym).enumerable;
      });
    }
    keys.push.apply(keys, symbols);
  }

  return keys;
}

/**
 * Object-spread helper: copy the own properties of every argument after the
 * first onto `target`, later sources overriding earlier ones.
 * @param {Object} target - object that receives the properties
 * @return {Object} - `target`
 */
function _objectSpread(target) {
  for (var i = 1; i < arguments.length; i++) {
    var source = arguments[i] != null ? arguments[i] : {};

    if (i % 2) {
      ownKeys(Object(source), true).forEach(function (key) {
        _defineProperty(target, key, source[key]);
      });
    } else if (Object.getOwnPropertyDescriptors) {
      Object.defineProperties(target, Object.getOwnPropertyDescriptors(source));
    } else {
      ownKeys(Object(source)).forEach(function (key) {
        Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key));
      });
    }
  }

  return target;
}

/**
 * Set `obj[key] = value`, redefining the property as a plain data property
 * when the key already exists on the object or its prototype chain.
 * @param {Object} obj - object to write to
 * @param {String|Symbol} key - property key
 * @param {*} value - property value
 * @return {Object} - `obj`
 */
function _defineProperty(obj, key, value) {
  if (key in obj) {
    Object.defineProperty(obj, key, {
      value: value,
      enumerable: true,
      configurable: true,
      writable: true
    });
  } else {
    obj[key] = value;
  }

  return obj;
}

// constants
// Pattern sources (XRegExp syntax); the compiled regexes are built from them below.
var _word = "[\\pL\\pM\\u200D\\u2060]+";
// TRICKY: original languages do not use single quotes so u2019 is considered part of a word
exports._word = _word;
var _origWord = "[\\pL\\pM\\u200D\\u2060\\u2019]+";
exports._origWord = _origWord;
// NOTE(review): XRegExp's brace-less `\pX` form appears to accept a single
// letter only, so `\pNd`, `\pNl` and `\pNo` may parse as `\pN` followed by the
// literal letters d, l and o - confirm against the XRegExp Unicode docs.
// `\p{Nd}` etc. would be the unambiguous spelling. Left unchanged here.
var _number = '[\\pN\\pNd\\pNl\\pNo]+';
exports._number = _number;
var _wordOrNumber = '(' + _word + '|' + _number + ')';
exports._wordOrNumber = _wordOrNumber;
var _origWordOrNumber = '(' + _origWord + '|' + _number + ')';
exports._origWordOrNumber = _origWordOrNumber;
// Greedy word: a word or number followed by one or more "-word" / "'word" /
// "’word" parts, or a plain word with an optional trailing ’.
var _greedyWord = '(' + _wordOrNumber + '([-\'’]' + _word + ')+|' + _word + '’?)';
exports._greedyWord = _greedyWord;
var _origGreedyWord = '(' + _origWordOrNumber + '([-\'’]' + _origWord + ')+|' + _origWord + '’?)';
exports._origGreedyWord = _origGreedyWord;
// Greedy number: digit runs joined by an optional ":", "." or ",", or a single run.
var _greedyNumber = '(' + _number + '([:.,]?' + _number + ')+|' + _number + ')';
exports._greedyNumber = _greedyNumber;

var word = (0, _xregexp["default"])(_word, '');
exports.word = word;
var origWord = (0, _xregexp["default"])(_origWord, '');
exports.origWord = origWord;
var greedyWord = (0, _xregexp["default"])(_greedyWord, '');
exports.greedyWord = greedyWord;
var origGreedyWord = (0, _xregexp["default"])(_origGreedyWord, '');
exports.origGreedyWord = origGreedyWord;
// The `^` anchors single punctuation marks to the start of the string being
// scanned; a doubled angle bracket ("<<", ">>", ...) may match anywhere.
var punctuation = (0, _xregexp["default"])('(^\\p{P}|[<>]{2})', '');
exports.punctuation = punctuation;
var whitespace = /\s+/;
exports.whitespace = whitespace;
var number = (0, _xregexp["default"])(_number);
exports.number = number;
var greedyNumber = (0, _xregexp["default"])(_greedyNumber); // /(\d+([:.,]?\d)+|\d+)/;
exports.greedyNumber = greedyNumber;
// Built by passing the `number` regex itself through XRegExp.
var number_ = (0, _xregexp["default"])(number);
exports.number_ = number_;

/**
 * Tokenize with the original-language parsers (`origWord` / `origGreedyWord`).
 * Any option in `params`, including `parsers`, overrides the defaults set here.
 * @param {Object} params - same options object as `tokenize`
 * @return {Array} - whatever `tokenize` returns for the merged options
 */
var tokenizeOrigLang = function tokenizeOrigLang(params) {
  return tokenize(_objectSpread({
    parsers: {
      word: origWord,
      greedyWord: origGreedyWord,
      whitespace: whitespace,
      punctuation: punctuation,
      number: number
    }
  }, params));
};

exports.tokenizeOrigLang = tokenizeOrigLang;

/**
 * Read one option, falling back to a default only when it is `undefined`.
 * @param {Object} options - options object
 * @param {String} name - option name
 * @param {*} fallback - value used when the option is undefined
 * @return {*} - the option value or the fallback
 */
function _option(options, name, fallback) {
  var value = options[name];
  return value === void 0 ? fallback : value;
}

/**
 * Tokenize a string into an array of words
 * @param {Object} params - options object
 * @param {String} [params.text=''] - string to be tokenized; null is treated as ''
 * @param {Boolean} [params.includeWords=true] - keep tokens of type 'word'
 * @param {Boolean} [params.includeNumbers=true] - keep tokens of type 'number'
 * @param {Boolean} [params.includePunctuation=false] - keep tokens of type 'punctuation'
 * @param {Boolean} [params.includeWhitespace=false] - keep tokens of type 'whitespace'
 * @param {Boolean} [params.includeUnknown=false] - keep tokens of type 'unknown'
 * @param {Boolean} [params.greedy=false] - use `parsers.greedyWord` as the word parser
 *   (when provided) and `greedyNumber` as the number parser
 * @param {Boolean} [params.verbose=false] - return token objects instead of plain strings
 * @param {Boolean} [params.occurrences=false] - add `occurrence` / `occurrences` to each
 *   token object (only visible with `verbose`)
 * @param {Object} [params.parsers] - map of token type => regex; never modified by this function
 * @param {Boolean} [params.normalize=false] - run the text through `normalizer` first
 * @param {*} [params.normalizations=null] - when set together with `normalize`, also passed
 *   to `normalizerDestructive`
 * @return {Array} - array of tokenized words/strings
 */
var tokenize = function tokenize(_ref) {
  // A missing options object behaves like an empty one.
  var options = _ref || {};
  // classifyTokens() treats falsy input as '', so do the same for null here
  // instead of throwing on `null.slice`.
  var text = options.text === void 0 || options.text === null ? '' : options.text;
  var includeWords = _option(options, 'includeWords', true);
  var includeNumbers = _option(options, 'includeNumbers', true);
  var includePunctuation = _option(options, 'includePunctuation', false);
  var includeWhitespace = _option(options, 'includeWhitespace', false);
  var includeUnknown = _option(options, 'includeUnknown', false);
  var greedy = _option(options, 'greedy', false);
  var verbose = _option(options, 'verbose', false);
  var occurrences = _option(options, 'occurrences', false);
  var parsers = _option(options, 'parsers', {
    word: word,
    greedyWord: greedyWord,
    whitespace: whitespace,
    punctuation: punctuation,
    number: number
  });
  var normalize = _option(options, 'normalize', false);
  var normalizations = _option(options, 'normalizations', null);

  var string = text.slice(0);
  if (normalize) string = (0, _normalizers.normalizer)(string);

  if (normalize && normalizations) {
    string = (0, _normalizers.normalizerDestructive)(string, normalizations);
  }

  // Always work on a shallow copy: `greedyWord` must not reach classifyTokens
  // as a token type, and deleting it from the caller's own object would break
  // a later greedy call that reuses that object.
  var activeParsers = _objectSpread({}, parsers);

  if (greedy) {
    // Fall back to the plain word parser when no greedy variant was supplied.
    if (parsers.greedyWord) activeParsers.word = parsers.greedyWord;
    activeParsers.number = greedyNumber;
  }

  delete activeParsers.greedyWord;

  var types = [];
  if (includeWords) types.push('word');
  if (includeNumbers) types.push('number');
  if (includeWhitespace) types.push('whitespace');
  if (includePunctuation) types.push('punctuation');
  if (includeUnknown) types.push('unknown');

  var tokens = classifyTokens(string, activeParsers, 'unknown').filter(function (token) {
    return types.includes(token.type);
  });

  if (occurrences) {
    // Occurrence numbers are computed against the filtered token list.
    var filtered = tokens;
    tokens = filtered.map(function (token, index) {
      var _occurrences = (0, _occurrences2.occurrencesInTokens)(filtered, token.token);

      var _occurrence = (0, _occurrences2.occurrenceInTokens)(filtered, index, token.token);

      return _objectSpread({}, token, {
        occurrence: _occurrence,
        occurrences: _occurrences
      });
    });
  }

  if (!verbose) {
    return tokens.map(function (token) {
      return token.token;
    });
  }

  // Verbose output drops the regex capture groups from each token object.
  tokens.forEach(function (token) {
    delete token.matches;
  });
  return tokens;
};

exports.tokenize = tokenize;

/**
 * Tiny tokenizer - https://gist.github.com/borgar/451393
 *
 * Repeatedly runs every parser against the remaining text, takes the match
 * that starts closest to the front (the earlier parser key wins a tie), emits
 * any skipped text as a `deftok` token, and continues after the match.
 * Zero-length matches are ignored because they would never consume input.
 * @param {String} string - string to be tokenized; falsy values are treated as ''
 * @param {Object} parsers - { word:/\w+/, whitespace:/\s+/, punctuation:/[^\w\s]/ }
 * @param {String} deftok - type to label tokens that are not classified with the above parsers
 * @return {Array} - array of objects => [{ token:"this", type:"word" },{ token:" ", type:"whitespace" }, Object { token:"is", type:"word" }, ... ]
 * @throws {Error} - when `string` is truthy but not a string
 **/
var classifyTokens = function classifyTokens(string, parsers, deftok) {
  string = !string ? '' : string; // if string is undefined, make it an empty string

  if (typeof string !== 'string') {
    throw new Error("tokenizer.tokenize() string is not String: ".concat(string));
  }

  var defaultType = deftok || 'unknown';
  var tokens = [];

  while (string) {
    var best = null;
    var start = string.length;

    for (var key in parsers) {
      if (!Object.prototype.hasOwnProperty.call(parsers, key)) continue;
      var parser = parsers[key];
      // A global or sticky regex remembers where its last exec() stopped;
      // always search the remaining text from its beginning.
      if (parser.global || parser.sticky) parser.lastIndex = 0;
      var match = parser.exec(string);

      // try to choose the best match if there are several
      // where "best" is the closest to the current starting point;
      // an empty match cannot advance the scan, so skip it
      if (match && match[0].length > 0 && match.index < start) {
        best = {
          token: match[0],
          type: key,
          matches: match.slice(1)
        };
        start = match.index;
      }
    }

    if (start > 0) {
      // there is text between last token and currently
      // matched token - push that out as default or "unknown"
      tokens.push({
        token: string.slice(0, start),
        type: defaultType
      });
    }

    if (best) {
      // push current token onto sequence
      tokens.push(best);
    }

    string = string.slice(start + (best ? best.token.length : 0));
  }

  return tokens;
};

exports.classifyTokens = classifyTokens;