// compromise
// Version:
// modest natural language processing
// 231 lines (223 loc) • 5.62 kB
// JavaScript
// a '{min,max}' quantifier, in any of the forms '{3}', '{2,4}', '{,9}' or '{3,}'
const hasMinMax = /\{([0-9]+)?(, *[0-9]*)?\}/
// the separator used inside an and-group, like '(one && two)'
const andSign = /&&/
// const hasDash = /\p{Letter}[-–—]\p{Letter}/u
// a leading '<name>' label; the name itself is captured in group 1
const captureName = /^<\s*(\S+)\s*>/
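/* Break down a match expression into an object like this.
Properties are only set when they apply to the token, and the
capture-group name is stored under `group` by the parser below.
{
word:'',
tag:'',
regex:'',
start:false,
end:false,
negative:false,
anything:false,
greedy:false,
optional:false,
named:'',
choices:[],
}
*/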
// upper-case the first character and leave the rest of the string as it is
const titleCase = (str) => {
  const first = str.charAt(0)
  const rest = str.substring(1)
  return first.toUpperCase() + rest
}
// the last character of a string ('' for an empty string)
const end = (str) => str.slice(-1)
// the first character of a string ('' for an empty string)
const start = (str) => str.substring(0, 1)
// everything after the first character
const stripStart = (str) => str.slice(1)
// everything before the last character
const stripEnd = (str) => str.slice(0, -1)
// drop the first and the last character, e.g. the wrappers of '/regex/' or '~soft~'
const stripBoth = (str) => stripEnd(stripStart(str))
//
/**
 * Parse one token of a match expression into a plain descriptor object.
 *
 * Wrapping flags ('^', '$', '?', '[ ]', '+', '*', '!', '{min,max}') are
 * stripped off first. That pass runs twice, so a flag that only becomes the
 * first/last character after another flag was removed is still picked up.
 * Whatever text is left is then classified as a '#Tag', an '@method',
 * a wildcard ('.' or '*') or a plain word.
 *
 * @param {string} w - the token text
 * @param {object} [opts] - parser options; an empty object when omitted
 * @param {number} [opts.fuzzy] - `min` stored on '~soft~' tokens (0.85 when falsy)
 * @param {boolean} [opts.caseSensitive] - when truthy, words keep their case
 *   and `use: 'text'` is set on word and regex tokens
 * @returns {object} descriptor holding only the properties that apply to this token
 * @throws {SyntaxError} from `new RegExp` when a '/regex/' token is not a valid pattern
 */
const parseToken = function (w, opts = {}) {
  const obj = {}
  //collect any flags (do it twice)
  for (let i = 0; i < 2; i += 1) {
    //end-flag
    if (end(w) === '$') {
      obj.end = true
      w = stripEnd(w)
    }
    //front-flag
    if (start(w) === '^') {
      obj.start = true
      w = stripStart(w)
    }
    if (end(w) === '?') {
      obj.optional = true
      w = stripEnd(w)
    }
    //capture group (this one can span multiple-terms)
    if (start(w) === '[' || end(w) === ']') {
      obj.group = null
      if (start(w) === '[') {
        obj.groupStart = true
      }
      if (end(w) === ']') {
        obj.groupEnd = true
      }
      w = w.replace(/^\[/, '')
      w = w.replace(/\]$/, '')
      // Use capture group name
      if (start(w) === '<') {
        const res = captureName.exec(w)
        // exec() gives null when the '<' is never closed (like '[<name'),
        // in that case the text is left alone instead of crashing
        if (res !== null && res.length >= 2) {
          obj.group = res[1]
          w = w.replace(res[0], '')
        }
      }
    }
    //back-flags
    if (end(w) === '+') {
      obj.greedy = true
      w = stripEnd(w)
    }
    // a trailing '*' is a greedy flag, unless it is the whole token or an escaped '\*'
    if (w !== '*' && end(w) === '*' && w !== '\\*') {
      obj.greedy = true
      w = stripEnd(w)
    }
    if (start(w) === '!') {
      obj.negative = true
      // obj.optional = true
      w = stripStart(w)
    }
    //soft-match
    if (start(w) === '~' && end(w) === '~' && w.length > 2) {
      w = stripBoth(w)
      obj.fuzzy = true
      obj.min = opts.fuzzy || 0.85
      // a plain fuzzy word is done here; one holding '(' falls through to the choices logic
      if (/\(/.test(w) === false) {
        obj.word = w
        return obj
      }
    }
    //regex
    if (start(w) === '/' && end(w) === '/') {
      w = stripBoth(w)
      if (opts.caseSensitive) {
        obj.use = 'text'
      }
      obj.regex = new RegExp(w) //potential vuln - security/detect-non-literal-regexp
      return obj
    }
    // support foo{1,9}
    if (hasMinMax.test(w) === true) {
      w = w.replace(hasMinMax, (_a, b, c) => {
        if (c === undefined) {
          // '{3}' Exactly three times
          obj.min = Number(b)
          obj.max = Number(b)
        } else {
          c = c.replace(/, */, '')
          if (b === undefined) {
            // '{,9}' implied zero min
            obj.min = 0
            obj.max = Number(c)
          } else {
            // '{2,4}' Two to four times
            obj.min = Number(b)
            // '{3,}' Three or more times
            obj.max = Number(c || 999)
          }
        }
        // use same method as '+'
        obj.greedy = true
        // 0 as min means the same as '?'
        if (!obj.min) {
          obj.optional = true
        }
        return ''
      })
    }
    //wrapped-flags
    if (start(w) === '(' && end(w) === ')') {
      // support (one && two)
      if (andSign.test(w)) {
        obj.choices = w.split(andSign)
        obj.operator = 'and'
      } else {
        obj.choices = w.split('|')
        obj.operator = 'or'
      }
      //remove '(' and ')'
      obj.choices[0] = stripStart(obj.choices[0])
      const last = obj.choices.length - 1
      obj.choices[last] = stripEnd(obj.choices[last])
      // clean up the results
      obj.choices = obj.choices.map(s => s.trim())
      obj.choices = obj.choices.filter(s => s)
      //recursion alert! - each choice becomes a list of parsed tokens
      obj.choices = obj.choices.map(str => {
        return str.split(/ /g).map(s => parseToken(s, opts))
      })
      w = ''
    }
    //root/sense overloaded - '{root}', '{root/pos}' or '{root/pos/sense}'
    if (start(w) === '{' && end(w) === '}') {
      w = stripBoth(w)
      // obj.sense = w
      obj.root = w
      if (/\//.test(w)) {
        const split = obj.root.split(/\//)
        obj.root = split[0]
        obj.pos = split[1]
        if (obj.pos === 'adj') {
          obj.pos = 'Adjective'
        }
        // titlecase (and lower-case the remainder, unlike titleCase())
        obj.pos = obj.pos.charAt(0).toUpperCase() + obj.pos.substring(1).toLowerCase()
        // add sense-number too
        if (split[2] !== undefined) {
          obj.sense = split[2]
        }
      }
      return obj
    }
    //chunks
    if (start(w) === '<' && end(w) === '>') {
      w = stripBoth(w)
      obj.chunk = titleCase(w)
      obj.greedy = true
      return obj
    }
    // '%name%' switch
    if (start(w) === '%' && end(w) === '%') {
      w = stripBoth(w)
      obj.switch = w
      return obj
    }
  }
  //do the actual token content
  if (start(w) === '#') {
    obj.tag = stripStart(w)
    obj.tag = titleCase(obj.tag)
    return obj
  }
  //dynamic function on a term object
  if (start(w) === '@') {
    obj.method = stripStart(w)
    return obj
  }
  if (w === '.') {
    obj.anything = true
    return obj
  }
  //support alone-astrix
  if (w === '*') {
    obj.anything = true
    obj.greedy = true
    obj.optional = true
    return obj
  }
  if (w) {
    //somehow handle encoded-chars?
    // global patterns, so every escaped '\*' and '\.' is unescaped, not only the first
    w = w.replace(/\\\*/g, '*')
    w = w.replace(/\\\./g, '.')
    if (opts.caseSensitive) {
      obj.use = 'text'
    } else {
      w = w.toLowerCase()
    }
    obj.word = w
  }
  return obj
}
export default parseToken