chinese-tokenizer
Version:
Simple algorithm to tokenize Chinese texts into words using CC-CEDICT.
58 lines (41 loc) • 1.22 kB
JavaScript
/**
 * Character-level trie that stores a list of values under each string key.
 *
 * Every node is a plain object: each own property named by a single code
 * point points to a child node, and the reserved `values` property holds the
 * values stored at that node. Keys are split by code point, so a child name is
 * never longer than one character and cannot collide with `values`.
 */
class Trie {
  constructor() {
    // Root node of the trie.
    this.content = {}
  }

  /**
   * Walks the trie along `key` and returns the node the path ends at.
   *
   * @param {*} key - Converted with `toString()`. The empty string is treated
   *   as one step named `''` below the root, not as the root itself.
   * @param {boolean} [create=false] - Create the missing nodes on the path.
   * @returns {Object} The node for `key`. If the path does not exist and
   *   `create` is false, a fresh empty object that is not part of the trie.
   */
  getKeyObject(key, create = false) {
    const text = key.toString()
    // Array.from splits by code point, keeping surrogate pairs together.
    const chars = text === '' ? [text] : Array.from(text)
    let node = this.content

    for (const char of chars) {
      if (node[char] == null) {
        if (!create) return {}
        node[char] = {}
      }
      node = node[char]
    }

    return node
  }

  /**
   * Returns the values stored under exactly `key`.
   *
   * @param {*} key - Key to look up.
   * @returns {Array} The stored values (the trie's own array, not a copy),
   *   or a new empty array when nothing is stored under `key`.
   */
  get(key) {
    const node = this.getKeyObject(key)
    return node.values || []
  }

  /**
   * Returns every value stored under `key` or under any key that extends it.
   *
   * Values come back in pre-order: a node's own values first, then each
   * child's subtree in property enumeration order. Because the empty key has
   * its own node, `getPrefix('')` only yields values pushed under `''`.
   *
   * @param {*} key - Prefix to search for.
   * @returns {Array} A new array with all matching values.
   */
  getPrefix(key) {
    const result = []

    // Append into one shared array instead of spreading each subtree's result
    // into push(): spreading a very large array as call arguments throws a
    // RangeError, and it copies every value again at each ancestor level.
    const collect = node => {
      if (node.values) {
        for (const value of node.values) result.push(value)
      }
      for (const char of Object.keys(node)) {
        if (char === 'values') continue
        const child = node[char]
        if (child == null) continue
        collect(child)
      }
    }

    collect(this.getKeyObject(key))
    return result
  }

  /**
   * Stores `value` under `key`, creating the path if needed. A value that is
   * already present under that key (per `Array.prototype.includes`) is not
   * added a second time.
   *
   * @param {*} key - Key to store the value under.
   * @param {*} value - Value to store.
   * @returns {Trie} This trie, to allow chaining.
   */
  push(key, value) {
    const node = this.getKeyObject(key, true)
    if (node.values == null) node.values = []
    if (!node.values.includes(value)) node.values.push(value)
    return this
  }
}
// CommonJS export: the value of this module is the Trie class itself.
module.exports = Trie