node-vntokenizer
Version:
Tokenizer for Vietnamese in Node.js and JavaScript
67 lines (66 loc) • 5.73 kB
JavaScript
module.exports = {
"numbersign": /#/,
"ampersand": /&/,
"date_mm-dd-yy": /(0*[1-9]|1[012])-(0*[1-9]|[12][0-9]|3[01])-\d\d/,
"date_mm/dd/yy": /(0*[1-9]|1[012])\/(0*[1-9]|[12][0-9]|3[01])\/\d\d/,
"date_mm.dd.yy": /(0*[1-9]|1[012])\.(0*[1-9]|[12][0-9]|3[01])\.\d\d/,
"date_dd-mm-yy": /([12][0-9]|3[01]|0*[1-9])-(1[012]||0*[1-9])-\d\d/,
"date_dd/mm/yy": /([12][0-9]|3[01]|0*[1-9])\/(1[012]||0*[1-9])\/\d\d/,
"date_dd.mm.yy": /([12][0-9]|3[01]|0*[1-9])[\.](1[012]||0*[1-9])[\.]\d\d/,
"date_dd-mm-yyyy": /([12][0-9]|3[01]|0*[1-9])-(1[012]||0*[1-9])-(19|20)\d\d/,
"date_dd/mm/yyyy": /([12][0-9]|3[01]|0*[1-9])\/(1[012]||0*[1-9])\/(19|20)\d\d/,
"date_dd.mm.yyyy": /([12][0-9]|3[01]|0*[1-9])\.(1[012]||0*[1-9])\.(19|20)\d\d/,
"date_dd-mm": /(0*[1-9]|[12][0-9]|3[01])[-\/\.](1[012]|0*[1-9])/,
"date_mm-yy": /(0*[1-9]|1[012])[-\/\.]\d\d/,
"date_mm-yyyy": /(0*[1-9]|1[012])[-\/\.](19|20)\d\d/,
"date_yyyy": /(19|20)\d\d/,
"date_mm-dd-yyyy": /(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])-(19|20)\d\d/,
"date_mm/dd/yyyy": /(0*[1-9]|1[012])\/([12][0-9]|3[01]|0*[1-9])\/(19|20)\d\d/,
"date_mm.dd.yyyy": /(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])\.(19|20)\d\d/,
"date_yyyy-mm-dd": /(19|20)\d\d-(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])/,
"date_yyyy/mm/dd": /(19|20)\d\d\/(0*[1-9]|1[012])\/([12][0-9]|3[01]|0*[1-9])/,
"date_yyyy.mm.dd": /(19|20)\d\d\.(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])/,
"hhmmss": /([0-1]\d|[2][0-3]):[0-5]\d:[0-5]\d/,
"percent": /([0-9]*[\.,])?[0-9]+%/,
"name1": /[A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*/,
"name2": /([A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*)(\s+[A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*/,
//"phrase": /([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ])?([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\s])*([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz])*/,
"phrase": /([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)/,
"allcaps": /([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)(\s*[AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)*[^aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\/\)\(\?!\.;:,\-"']/,
"fraction": /(\d+)\/(\d+)/,
"email": /(\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3})/,
"return": /(^$)/,
"fslash": /\//,
"langle": /</,
"xmltags": /<\/*\w*>/,
"equal": /=/,
"rangle": />/,
"aroba": /@/,
"number1": /[+]?([0-9]*)?[0-9]+([\.,]\d+)*/,
"number2": /[+]?([0-9]*)?[0-9]+([\.,]\d+)*(\s|tỉ|tỷ|triệu|ngàn|nghìn|trăm|chục)*/,
"degree": /[-+]?([0-9]*[\.,])?[0-9]+°/,
"ponctuation": /[\\?!\\.:;,\-"']/,
"dollar": /\$/,
"lparen": /\(/,
"rparen": /\)/,
"asterisk": /\*/,
"plus": /\+/,
"minus": /\-/,
"ellipsis": /\.\.\./,
"residual": /\W/,
"lbracket": /\[/,
"bslash": /\\/,
"rbracket": /\]/,
"entity0": /\d+([\.,]\d+)*[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+\d+$/,
"entity1": /[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+(\d)*$/,
"entity2": /[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\d]+([\.\-/][\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*[\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+/,
"space": /\s+/,
"word": /\w/,
"lcbrace": /\{/,
"rcbrace": /\}/,
"underscore": /_/,
"pound": /£/,
//number: /[0-9]*\.[0-9]+|[0-9]+/ig,
// space: /\s+/ig,
unblank: /\S/,
};