UNPKG

node-nlp

Version:

Library for NLU (Natural Language Understanding) done in Node.js

382 lines (340 loc) 9.72 kB
/*
 * Copyright (c) AXA Shared Services Spain S.A.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

const { TreebankWordTokenizer } = require("../tokenizers");

/**
 * Tests whether the char code `ch` is a member of the grouping bitmap `s`.
 *
 * A code outside [min, max] is never a member. Otherwise bit number
 * `ch - min` is looked up: element `(ch - min) >>> 3` of `s`, bit
 * `(ch - min) & 7` within that element.
 *
 * @param {number[]} s - Bitmap, indexed as described above.
 * @param {number} ch - Char code to test.
 * @param {number} min - Lowest char code covered by the bitmap.
 * @param {number} max - Highest char code covered by the bitmap.
 * @returns {boolean} True when the bit for `ch` is set.
 */
function isInGrouping(s, ch, min, max) {
  if (ch > max || ch < min) return false;
  const offset = ch - min;
  return (s[offset >>> 3] & (0x1 << (offset & 0x7))) !== 0;
}

/**
 * Returns the stem of `word`, running `stemmer.stem()` only on a cache miss.
 *
 * Shared by `stemWord` and `stemWords` so that both use exactly the same
 * lookup and fill logic. Cache keys are the word prefixed with ".", which
 * means a word such as "constructor" cannot match a property inherited from
 * Object.prototype on the plain-object cache.
 *
 * @param {BaseStemmer} stemmer - Stemmer whose cache and state are used.
 * @param {string} word - Word to stem.
 * @returns {string} Value of `stemmer.getCurrent()` after stemming, or the
 *   previously cached value.
 */
function stemWithCache(stemmer, word) {
  const key = `.${word}`;
  let result = stemmer.cache[key];
  if (result == null) {
    stemmer.setCurrent(word);
    stemmer.stem();
    result = stemmer.getCurrent();
    stemmer.cache[key] = result;
  }
  return result;
}

/**
 * Base class for stemmers: holds the string being processed together with a
 * cursor, forward/backward limits and a slice (`bra`..`ket`), and offers
 * matching, searching and editing primitives on that state.
 *
 * NOTE(review): the method names look like the Snowball runtime API; confirm
 * against the concrete stemmers before relying on Snowball documentation.
 */
class BaseStemmer {
  /**
   * @param {object} [tokenizer] - Object with a `tokenize(text, flag)` method,
   *   used by `tokenizeAndStem`. Defaults to a new TreebankWordTokenizer.
   */
  constructor(tokenizer) {
    this.cache = {};
    this.setCurrent("");
    this.tokenizer = tokenizer || new TreebankWordTokenizer();
  }

  /**
   * Loads a new string and resets cursor, limits and slice to span all of it.
   *
   * @param {string} value - String to process.
   */
  setCurrent(value) {
    this.current = value;
    this.cursor = 0;
    this.limit = this.current.length;
    this.limit_backward = 0;
    this.bra = this.cursor;
    this.ket = this.limit;
  }

  /**
   * @returns {string} The string currently being processed.
   */
  getCurrent() {
    return this.current;
  }

  /**
   * Copies the processing state (string, cursor, limits, slice) of another
   * stemmer into this one. The cache and tokenizer are not copied.
   *
   * @param {BaseStemmer} other - Stemmer to copy the state from.
   */
  copy_from(other) {
    this.current = other.current;
    this.cursor = other.cursor;
    this.limit = other.limit;
    this.limit_backward = other.limit_backward;
    this.bra = other.bra;
    this.ket = other.ket;
  }

  /**
   * Moves the cursor forward by one if the char at the cursor is in the
   * grouping.
   *
   * @param {number[]} s - Grouping bitmap (see `isInGrouping`).
   * @param {number} min - Lowest char code covered by the bitmap.
   * @param {number} max - Highest char code covered by the bitmap.
   * @returns {boolean} True if the cursor moved.
   */
  in_grouping(s, min, max) {
    if (this.cursor >= this.limit) return false;
    const ch = this.current.charCodeAt(this.cursor);
    if (!isInGrouping(s, ch, min, max)) return false;
    this.cursor++;
    return true;
  }

  /**
   * Moves the cursor backward by one if the char before the cursor is in the
   * grouping.
   *
   * @param {number[]} s - Grouping bitmap (see `isInGrouping`).
   * @param {number} min - Lowest char code covered by the bitmap.
   * @param {number} max - Highest char code covered by the bitmap.
   * @returns {boolean} True if the cursor moved.
   */
  in_grouping_b(s, min, max) {
    if (this.cursor <= this.limit_backward) return false;
    const ch = this.current.charCodeAt(this.cursor - 1);
    if (!isInGrouping(s, ch, min, max)) return false;
    this.cursor--;
    return true;
  }

  /**
   * Moves the cursor forward by one if the char at the cursor is NOT in the
   * grouping (out of [min, max], or its bit is clear).
   *
   * @param {number[]} s - Grouping bitmap (see `isInGrouping`).
   * @param {number} min - Lowest char code covered by the bitmap.
   * @param {number} max - Highest char code covered by the bitmap.
   * @returns {boolean} True if the cursor moved.
   */
  out_grouping(s, min, max) {
    if (this.cursor >= this.limit) return false;
    const ch = this.current.charCodeAt(this.cursor);
    if (isInGrouping(s, ch, min, max)) return false;
    this.cursor++;
    return true;
  }

  /**
   * Moves the cursor backward by one if the char before the cursor is NOT in
   * the grouping (out of [min, max], or its bit is clear).
   *
   * @param {number[]} s - Grouping bitmap (see `isInGrouping`).
   * @param {number} min - Lowest char code covered by the bitmap.
   * @param {number} max - Highest char code covered by the bitmap.
   * @returns {boolean} True if the cursor moved.
   */
  out_grouping_b(s, min, max) {
    if (this.cursor <= this.limit_backward) return false;
    const ch = this.current.charCodeAt(this.cursor - 1);
    if (isInGrouping(s, ch, min, max)) return false;
    this.cursor--;
    return true;
  }

  /**
   * Moves the cursor forward by one if the char code at the cursor lies in
   * [min, max].
   *
   * @param {number} min - Lowest accepted char code.
   * @param {number} max - Highest accepted char code.
   * @returns {boolean} True if the cursor moved.
   */
  in_range(min, max) {
    if (this.cursor >= this.limit) return false;
    const ch = this.current.charCodeAt(this.cursor);
    if (ch > max || ch < min) return false;
    this.cursor++;
    return true;
  }

  /**
   * Moves the cursor backward by one if the char code before the cursor lies
   * in [min, max].
   *
   * @param {number} min - Lowest accepted char code.
   * @param {number} max - Highest accepted char code.
   * @returns {boolean} True if the cursor moved.
   */
  in_range_b(min, max) {
    if (this.cursor <= this.limit_backward) return false;
    const ch = this.current.charCodeAt(this.cursor - 1);
    if (ch > max || ch < min) return false;
    this.cursor--;
    return true;
  }

  /**
   * Moves the cursor forward by one if the char code at the cursor lies
   * outside [min, max].
   *
   * @param {number} min - Lowest char code of the excluded range.
   * @param {number} max - Highest char code of the excluded range.
   * @returns {boolean} True if the cursor moved.
   */
  out_range(min, max) {
    if (this.cursor >= this.limit) return false;
    const ch = this.current.charCodeAt(this.cursor);
    if (!(ch > max || ch < min)) return false;
    this.cursor++;
    return true;
  }

  /**
   * Moves the cursor backward by one if the char code before the cursor lies
   * outside [min, max].
   *
   * @param {number} min - Lowest char code of the excluded range.
   * @param {number} max - Highest char code of the excluded range.
   * @returns {boolean} True if the cursor moved.
   */
  out_range_b(min, max) {
    if (this.cursor <= this.limit_backward) return false;
    const ch = this.current.charCodeAt(this.cursor - 1);
    if (!(ch > max || ch < min)) return false;
    this.cursor--;
    return true;
  }

  /**
   * Moves the cursor forward past `s` if the next `s_size` chars equal `s`.
   *
   * @param {number} s_size - Number of chars to compare.
   * @param {string} s - Expected text (a primitive string).
   * @returns {boolean} True if the text matched and the cursor moved.
   */
  eq_s(s_size, s) {
    if (this.limit - this.cursor < s_size) return false;
    if (this.current.slice(this.cursor, this.cursor + s_size) !== s) {
      return false;
    }
    this.cursor += s_size;
    return true;
  }

  /**
   * Moves the cursor backward past `s` if the `s_size` chars before the
   * cursor equal `s`.
   *
   * @param {number} s_size - Number of chars to compare.
   * @param {string} s - Expected text (a primitive string).
   * @returns {boolean} True if the text matched and the cursor moved.
   */
  eq_s_b(s_size, s) {
    if (this.cursor - this.limit_backward < s_size) return false;
    if (this.current.slice(this.cursor - s_size, this.cursor) !== s) {
      return false;
    }
    this.cursor -= s_size;
    return true;
  }

  /**
   * Same as `eq_s`, using the length of `s` itself.
   *
   * @param {string} s - Expected text.
   * @returns {boolean} True if the text matched and the cursor moved.
   */
  eq_v(s) {
    return this.eq_s(s.length, s);
  }

  /**
   * Same as `eq_s_b`, using the length of `s` itself.
   *
   * @param {string} s - Expected text.
   * @returns {boolean} True if the text matched and the cursor moved.
   */
  eq_v_b(s) {
    return this.eq_s_b(s.length, s);
  }

  /**
   * Searches the table `v` for an entry whose `s` matches the text starting
   * at the cursor, and moves the cursor past the match.
   *
   * Each entry is read through its `s`, `s_size`, `substring_i`, `result`,
   * `method` and `instance` properties. The first loop is a binary search,
   * so `v` is assumed to be ordered by `s` (verify with whatever builds the
   * tables). The second loop walks the `substring_i` chain from the
   * candidate it found until an entry is accepted or the chain ends
   * (`substring_i < 0`).
   *
   * @param {object[]} v - Table of entries.
   * @param {number} [v_size] - Number of entries to search; defaults to
   *   `v.length` when omitted or 0.
   * @returns {*} `result` of the accepted entry, or 0 when none is accepted.
   */
  find_among(v, v_size) {
    let i = 0;
    let j = v_size || v.length;
    const c = this.cursor;
    const l = this.limit;
    let commonI = 0;
    let commonJ = 0;
    let firstKeyInspected = false;
    while (true) {
      const k = i + ((j - i) >>> 1);
      let diff = 0;
      let common = commonI < commonJ ? commonI : commonJ; // smaller
      const w = v[k];
      for (let i2 = common; i2 < w.s_size; i2++) {
        if (c + common === l) {
          diff = -1;
          break;
        }
        diff = this.current.charCodeAt(c + common) - w.s.charCodeAt(i2);
        if (diff !== 0) break;
        common++;
      }
      if (diff < 0) {
        j = k;
        commonJ = common;
      } else {
        i = k;
        commonI = common;
      }
      if (j - i <= 1) {
        if (i > 0) break; // v->s has been inspected
        if (j === i) break; // only one item in v
        // - but now we need to go round once more to get
        // v->s inspected. This looks messy, but is actually
        // the optimal approach.
        if (firstKeyInspected) break;
        firstKeyInspected = true;
      }
    }
    // This loop can only be left through one of its return statements.
    while (true) {
      const w = v[i];
      if (commonI >= w.s_size) {
        this.cursor = c + w.s_size;
        if (w.method == null) {
          return w.result;
        }
        // NOTE(review): the forward search passes `w.instance` while
        // find_among_b passes `this`; confirm that this is intended.
        const res = w.method(w.instance);
        // Set the cursor again after the call; presumably the method may
        // move it (TODO confirm).
        this.cursor = c + w.s_size;
        if (res) {
          return w.result;
        }
      }
      i = w.substring_i;
      if (i < 0) return 0;
    }
  }

  /**
   * find_among_b is for backwards processing: it matches entries of `v`
   * against the text ending at the cursor, comparing from the last char of
   * each entry, and moves the cursor back past the match. Bounded by
   * `limit_backward`. The notes on `find_among` apply here as well.
   *
   * @param {object[]} v - Table of entries.
   * @param {number} [v_size] - Number of entries to search; defaults to
   *   `v.length` when omitted or 0.
   * @returns {*} `result` of the accepted entry, or 0 when none is accepted.
   */
  find_among_b(v, v_size) {
    let i = 0;
    let j = v_size || v.length;
    const c = this.cursor;
    const lb = this.limit_backward;
    let commonI = 0;
    let commonJ = 0;
    let firstKeyInspected = false;
    while (true) {
      const k = i + ((j - i) >> 1);
      let diff = 0;
      let common = commonI < commonJ ? commonI : commonJ;
      const w = v[k];
      for (let i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
        if (c - common === lb) {
          diff = -1;
          break;
        }
        diff = this.current.charCodeAt(c - 1 - common) - w.s.charCodeAt(i2);
        if (diff !== 0) break;
        common++;
      }
      if (diff < 0) {
        j = k;
        commonJ = common;
      } else {
        i = k;
        commonI = common;
      }
      if (j - i <= 1) {
        if (i > 0) break;
        if (j === i) break;
        if (firstKeyInspected) break;
        firstKeyInspected = true;
      }
    }
    // This loop can only be left through one of its return statements.
    while (true) {
      const w = v[i];
      if (commonI >= w.s_size) {
        this.cursor = c - w.s_size;
        if (w.method == null) return w.result;
        const res = w.method(this);
        // Set the cursor again after the call; presumably the method may
        // move it (TODO confirm).
        this.cursor = c - w.s_size;
        if (res) return w.result;
      }
      i = w.substring_i;
      if (i < 0) return 0;
    }
  }

  /**
   * Replaces the chars between `c_bra` and `c_ket` in `this.current` by the
   * chars in `s`, then adjusts `limit` and the cursor for the length change.
   * A cursor at or after `c_ket` shifts by the adjustment; a cursor inside
   * the replaced span is moved to `c_bra`.
   *
   * @param {number} c_bra - Start index of the span (inclusive).
   * @param {number} c_ket - End index of the span (exclusive).
   * @param {string} s - Replacement text.
   * @returns {number} Change in length: `s.length - (c_ket - c_bra)`.
   */
  replace_s(c_bra, c_ket, s) {
    const adjustment = s.length - (c_ket - c_bra);
    this.current =
      this.current.slice(0, c_bra) + s + this.current.slice(c_ket);
    this.limit += adjustment;
    if (this.cursor >= c_ket) this.cursor += adjustment;
    else if (this.cursor > c_bra) this.cursor = c_bra;
    return adjustment;
  }

  /**
   * Checks that the slice markers are usable: it returns false when
   * `bra < 0`, `bra > ket`, `ket > limit` or `limit > current.length`.
   *
   * @returns {boolean} False if any of those conditions holds.
   */
  slice_check() {
    return !(
      this.bra < 0 ||
      this.bra > this.ket ||
      this.ket > this.limit ||
      this.limit > this.current.length
    );
  }

  /**
   * Replaces the current slice (`bra`..`ket`) with `s` when the slice passes
   * `slice_check`.
   *
   * @param {string} s - Replacement text.
   * @returns {boolean} True if the replacement was made.
   */
  slice_from(s) {
    if (!this.slice_check()) return false;
    this.replace_s(this.bra, this.ket, s);
    return true;
  }

  /**
   * Deletes the current slice by replacing it with the empty string.
   *
   * @returns {boolean} True if the slice was deleted.
   */
  slice_del() {
    return this.slice_from("");
  }

  /**
   * Replaces the span `c_bra`..`c_ket` with `s` and shifts `bra` and `ket`
   * by the length change when they are at or after `c_bra`.
   *
   * @param {number} c_bra - Start index of the span (inclusive).
   * @param {number} c_ket - End index of the span (exclusive).
   * @param {string} s - Text to put in place of the span.
   */
  insert(c_bra, c_ket, s) {
    const adjustment = this.replace_s(c_bra, c_ket, s);
    if (c_bra <= this.bra) this.bra += adjustment;
    if (c_bra <= this.ket) this.ket += adjustment;
  }

  /**
   * Returns the text of the current slice. Nothing is written into the
   * argument; it is accepted only to keep the signature.
   *
   * @param {*} s - Unused.
   * @returns {string} Text between `bra` and `ket`, or "" when the slice
   *   fails `slice_check`.
   */
  // eslint-disable-next-line no-unused-vars
  slice_to(s) {
    return this.slice_check() ? this.current.slice(this.bra, this.ket) : "";
  }

  /**
   * Returns the current string up to `limit`. The argument is accepted only
   * to keep the signature.
   *
   * @param {*} s - Unused.
   * @returns {string} `current` from index 0 up to `limit`.
   */
  // eslint-disable-next-line no-unused-vars
  assign_to(s) {
    return this.current.slice(0, this.limit);
  }

  /**
   * Stems the string held in `this.current` in place. This base
   * implementation changes nothing; presumably subclasses override it.
   *
   * @returns {boolean} Always false here.
   */
  stem() {
    return false;
  }

  /**
   * Stems a single word, using the per-instance cache.
   *
   * @param {string} word - Word to stem.
   * @returns {string} The stem.
   */
  stemWord(word) {
    return stemWithCache(this, word);
  }

  /**
   * Stems every word of an array (or array-like), using the per-instance
   * cache.
   *
   * @param {string[]} words - Words to stem.
   * @returns {string[]} Stems, in the same order as `words`.
   */
  stemWords(words) {
    const results = [];
    for (let i = 0; i < words.length; i++) {
      results.push(stemWithCache(this, words[i]));
    }
    return results;
  }

  /**
   * Tokenizes `text` with the configured tokenizer (called as
   * `tokenize(text, true)`), lower-cases each token and stems it.
   *
   * @param {string} text - Text to process.
   * @returns {string[]} One stem per token.
   */
  tokenizeAndStem(text) {
    const tokens = [];
    this.tokenizer.tokenize(text, true).forEach((token) => {
      tokens.push(token.toLowerCase());
    });
    return this.stemWords(tokens);
  }
}

module.exports = BaseStemmer;