UNPKG

node-vntokenizer

Version:

Tokenizer for Vietnamese in Nodejs and Javascript

67 lines (66 loc) 5.73 kB
module.exports = { "numbersign": /#/, "ampersand": /&/, "date_mm-dd-yy": /(0*[1-9]|1[012])-(0*[1-9]|[12][0-9]|3[01])-\d\d/, "date_mm/dd/yy": /(0*[1-9]|1[012])\/(0*[1-9]|[12][0-9]|3[01])\/\d\d/, "date_mm.dd.yy": /(0*[1-9]|1[012])\.(0*[1-9]|[12][0-9]|3[01])\.\d\d/, "date_dd-mm-yy": /([12][0-9]|3[01]|0*[1-9])-(1[012]||0*[1-9])-\d\d/, "date_dd/mm/yy": /([12][0-9]|3[01]|0*[1-9])\/(1[012]||0*[1-9])\/\d\d/, "date_dd.mm.yy": /([12][0-9]|3[01]|0*[1-9])[\.](1[012]||0*[1-9])[\.]\d\d/, "date_dd-mm-yyyy": /([12][0-9]|3[01]|0*[1-9])-(1[012]||0*[1-9])-(19|20)\d\d/, "date_dd/mm/yyyy": /([12][0-9]|3[01]|0*[1-9])\/(1[012]||0*[1-9])\/(19|20)\d\d/, "date_dd.mm.yyyy": /([12][0-9]|3[01]|0*[1-9])\.(1[012]||0*[1-9])\.(19|20)\d\d/, "date_dd-mm": /(0*[1-9]|[12][0-9]|3[01])[-\/\.](1[012]|0*[1-9])/, "date_mm-yy": /(0*[1-9]|1[012])[-\/\.]\d\d/, "date_mm-yyyy": /(0*[1-9]|1[012])[-\/\.](19|20)\d\d/, "date_yyyy": /(19|20)\d\d/, "date_mm-dd-yyyy": /(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])-(19|20)\d\d/, "date_mm/dd/yyyy": /(0*[1-9]|1[012])\/([12][0-9]|3[01]|0*[1-9])\/(19|20)\d\d/, "date_mm.dd.yyyy": /(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])\.(19|20)\d\d/, "date_yyyy-mm-dd": /(19|20)\d\d-(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])/, "date_yyyy/mm/dd": /(19|20)\d\d\/(0*[1-9]|1[012])\/([12][0-9]|3[01]|0*[1-9])/, "date_yyyy.mm.dd": /(19|20)\d\d\.(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])/, "hhmmss": /([0-1]\d|[2][0-3]):[0-5]\d:[0-5]\d/, "percent": /([0-9]*[\.,])?[0-9]+%/, "name1": /[A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*/, "name2": /([A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*)(\s+[A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*/, //"phrase": /([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ])?([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\s])*([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz])*/, "phrase": /([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)/, "allcaps": /([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)(\s*[AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)*[^aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\/\)\(\?!\.;:,\-"']/, "fraction": /(\d+)\/(\d+)/, "email": /(\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3})/, "return": /(^$)/, "fslash": /\//, "langle": /</, "xmltags": /<\/*\w*>/, "equal": /=/, "rangle": />/, "aroba": /@/, "number1": /[+]?([0-9]*)?[0-9]+([\.,]\d+)*/, "number2": /[+]?([0-9]*)?[0-9]+([\.,]\d+)*(\s|tỉ|tỷ|triệu|ngàn|nghìn|trăm|chục)*/, "degree": /[-+]?([0-9]*[\.,])?[0-9]+°/, "ponctuation": /[\\?!\\.:;,\-"']/, "dollar": /\$/, "lparen": /\(/, "rparen": /\)/, "asterisk": /\*/, "plus": /\+/, "minus": /\-/, "ellipsis": /\.\.\./, "residual": /\W/, "lbracket": /\[/, "bslash": /\\/, "rbracket": /\]/, "entity0": /\d+([\.,]\d+)*[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+\d+$/, "entity1": /[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+(\d)*$/, "entity2": /[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\d]+([\.\-/][\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*[\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+/, "space": /\s+/, "word": /\w/, "lcbrace": /\{/, "rcbrace": /\}/, "underscore": /_/, "pound": /£/, //number: /[0-9]*\.[0-9]+|[0-9]+/ig, // space: /\s+/ig, unblank: /\S/, };