// nk-vector
// Version:
// 314 lines • 12.7 kB
// JavaScript
const path = require('path');
/**
 * Forwards both arguments to the `one_hot` function exported by
 * `src/Create_onehot.js`.
 *
 * @param {*} file_url - Passed through unchanged as the first argument
 *   (presumably the input file location — confirm in src/Create_onehot.js).
 * @param {*} url_save - Passed through unchanged as the second argument
 *   (presumably the output location — confirm in src/Create_onehot.js).
 * @returns {undefined} The helper's return value is not propagated.
 */
module.exports.create_one_hot = function (file_url, url_save) {
    // The helper module is required at call time, not when this file is loaded.
    const one_hot_module = require(path.join(__dirname, "/src/Create_onehot.js"));
    one_hot_module.one_hot(file_url, url_save);
};
/**
 * Forwards its arguments to the `window` function exported by
 * `src/Create_windows.js`.
 *
 * @param {*} file_url - Passed through unchanged as the first argument.
 * @param {*} window_size - Passed through unchanged as the second argument.
 * @param {*} url_save - Passed through unchanged as the third argument.
 * @returns {undefined} The helper's return value is not propagated.
 */
module.exports.create_window_words = function (file_url, window_size, url_save) {
    // The helper module is required at call time, not when this file is loaded.
    const windows_module = require(path.join(__dirname, "/src/Create_windows.js"));
    windows_module.window(file_url, window_size, url_save);
};
/**
 * Forwards its arguments to the `training` function exported by
 * `src/Run_train.js`.
 *
 * @param {*} size_output - Passed through unchanged as the first argument.
 * @param {*} url_data_one_hot - Passed through unchanged as the second argument.
 * @param {*} url_data_window_words - Passed through unchanged as the third argument.
 * @param {*} url_save - Passed through unchanged as the fourth argument.
 * @returns {undefined} The helper's return value is not propagated.
 */
module.exports.train = function (size_output, url_data_one_hot, url_data_window_words, url_save) {
    // The helper module is required at call time, not when this file is loaded.
    const trainer_module = require(path.join(__dirname, "/src/Run_train.js"));
    trainer_module.training(
        size_output,
        url_data_one_hot,
        url_data_window_words,
        url_save
    );
};
/**
 * Builds one averaged word vector per line of `document`.
 *
 * Each line is normalised, stripped of the stop words listed in
 * `src/stop_word.txt`, and split on spaces. A word contributes its vector only
 * when it occurs exactly once in the line, is longer than 3 characters, and is
 * an own key of the word-vector table. The line's vector is the element-wise
 * mean of the contributing vectors.
 *
 * @param {string} document - Text to convert; lines are separated by "\n".
 * @param {string} url_vecs_of_words - Path of a JSON file mapping word -> vector.
 *   Failures to read or parse this file are thrown to the caller.
 * @param {string} [url_save] - When non-empty, the result is written to this
 *   path as JSON (asynchronously) and nothing is returned. When empty or
 *   omitted, the result is returned instead.
 * @returns {Object|undefined} Map of original line -> mean vector (an array)
 *   when `url_save` is empty/omitted and at least one line produced a vector;
 *   otherwise undefined. Errors raised after the vector file is loaded are
 *   logged, not thrown.
 */
module.exports.build_vec_sentences = function (document, url_vecs_of_words, url_save) {
    const fs = require("fs");
    // Deliberately outside the try below: problems with the vector file reach the caller.
    const wordVecs = JSON.parse(fs.readFileSync(url_vecs_of_words, 'utf8'));

    // Lower-cases text and reduces punctuation, digits and whitespace runs.
    // Named normalize_text so it does not shadow Node's global `process`.
    function normalize_text(text) {
        return text
            .replace(/[’“”%&!’#√.*+?,;^${}()`'"|[\]\\//]/g, " ")
            .replace(/[0-9]/g, '')
            .replace(/[=:\-><@]/g, " ")
            .replace(/\s+/g, ' ')
            // NOTE(review): removes only the FIRST space, which glues the first two
            // words together when the text does not start with whitespace. Kept
            // as-is because the training code (not visible here) may rely on the
            // same normalisation — confirm and remove if unintended.
            .replace(" ", "")
            .toLocaleLowerCase()
            .trim();
    }

    // Drops stop words and tokens shorter than 2 characters from a space-separated string.
    function filter_stop_word(text, stop_words) {
        return text
            .split(' ')
            .filter(word => !stop_words.has(normalize_text(word)))
            .filter(word => word.length >= 2)
            .join(' ')
            .trim();
    }

    // Element-wise mean of the given vectors; undefined when there is nothing to average.
    function average_vectors(vectors) {
        if (vectors.length === 0) {
            return undefined;
        }
        const first = vectors[0];
        // Accumulate into a copy: the vectors are entries of wordVecs, and summing
        // in place would corrupt them for every later sentence.
        const total = Array.isArray(first) ? first.slice() : Object.assign({}, first);
        for (let i = 1; i < vectors.length; i++) {
            for (const key of Object.keys(vectors[i])) {
                total[key] += vectors[i][key];
            }
        }
        const mean = Object.keys(total).map(key => total[key] / vectors.length);
        return mean.length !== 0 ? mean : undefined;
    }

    // Converts one line of text to its mean vector, or undefined when no word qualifies.
    function document2vec(sentence, stop_words) {
        const words = filter_stop_word(normalize_text(sentence).trim(), stop_words).split(' ');
        const occurrences = new Map();
        for (const word of words) {
            occurrences.set(word, (occurrences.get(word) || 0) + 1);
        }
        const vectors = [];
        for (const word of words) {
            // Words repeated within the line are skipped entirely (all occurrences).
            // The own-key check keeps words such as "constructor" from resolving to
            // Object.prototype members.
            if (occurrences.get(word) === 1 &&
                word.length > 3 &&
                Object.prototype.hasOwnProperty.call(wordVecs, word) &&
                wordVecs[word] != undefined) {
                vectors.push(wordVecs[word]);
            }
        }
        return average_vectors(vectors);
    }

    try {
        // Accept both CRLF and LF line endings in the stop-word list.
        const stop_words = new Set(
            fs.readFileSync(path.join(__dirname,"/src/stop_word.txt")).toString().split(/\r?\n/)
        );

        const return_document_vec = {};
        for (const sentence of document.split('\n')) {
            const sen_vec = document2vec(sentence, stop_words);
            if (sen_vec !== undefined) {
                return_document_vec[sentence] = sen_vec;
            } else {
                // Warning (Vietnamese): the sentence could not be converted to a vector,
                // most likely because the stop-word and special-character filters
                // removed every word that exists in the vector data.
                console.log('\x1b[33m','Thông báo! Không thể chuyển câu '+sentence+' sang vector, rất có thể nó đã bị bộ lọc stopword và bộ lọc ký tự đặc biệt đã chặn nên các từ vựng trong câu này không xuất hiện ở bộ data vector ','\x1b[0m')
            }
        }

        if (Object.keys(return_document_vec).length === 0) {
            return undefined;
        }
        const serialized = JSON.stringify(return_document_vec);
        // An omitted url_save means "return the result" instead of raising a TypeError.
        if (url_save != null && url_save.length > 0) {
            fs.writeFile(url_save, serialized, function (err) {
                if (err) {
                    console.log(err);
                } else {
                    console.log('Saved vecs');
                }
            });
            return undefined;
        }
        return JSON.parse(serialized);
    }
    catch (e) {
        // Message (Vietnamese): "Error! File path error, please set the path again!"
        console.log('\x1b[41m','Lỗi! Lỗi đường dẫn tệp, vui lòng định đường dẫn lại!','\x1b[0m')
        // Every failure above lands here, not only path problems, so show the real cause too.
        console.log(e && e.message ? e.message : e);
        return undefined;
    }
};
/**
 * Calls the function exported by `src/search_word_similarity.js` with the
 * three arguments unchanged and returns whatever it returns.
 *
 * @param {*} target - Passed through as the first argument.
 * @param {*} url_vecs_of_word - Passed through as the second argument.
 * @param {*} size_result - Passed through as the third argument.
 * @returns {*} The helper's return value.
 */
module.exports.search_word_similarity = function (target, url_vecs_of_word, size_result) {
    // The helper module itself is the callable; it is required at call time.
    const find_similar = require(path.join(__dirname, "/src/search_word_similarity.js"));
    const matches = find_similar(target, url_vecs_of_word, size_result);
    return matches;
};
/**
 * Returns the `k` entries of `data` closest to `target`.
 *
 * @param {Object|Array} target - Query vector; its own enumerable keys are the
 *   dimensions.
 * @param {string} type_distance - "eculid" (historical spelling), "euclid" or
 *   "euclidean" for squared Euclidean distance via the kd-tree in
 *   `src/KD-tree.js`; "cosine" for cosine similarity.
 * @param {Object|Array} data - Collection of vectors. Entries without any
 *   enumerable key are ignored.
 * @param {number} k - Number of neighbours to return.
 * @returns {Array|undefined} Array of `[values, score]` pairs. For the
 *   Euclidean modes `score` is the squared distance, sorted ascending. For
 *   "cosine" it is |a·b| / (‖a‖‖b‖), sorted descending. Undefined when `data`
 *   or `target` is empty or `type_distance` is not recognised.
 */
module.exports.knn = function (target, type_distance, data, k) {
    // Copy every row into a plain object keyed by its own keys (array index or
    // property name) so the inputs are never touched.
    const points = Object.values(data || {})
        .map(row => Object.assign({}, row))
        .filter(point => Object.keys(point).length > 0);
    const search = Object.assign({}, target);

    if (points.length === 0 || Object.keys(search).length === 0) {
        return undefined;
    }

    if (type_distance === "eculid" || type_distance === "euclid" || type_distance === "euclidean") {
        const kdTree = require(path.join(__dirname, "/src/KD-tree.js"));
        // Squared distance: no square root is taken, which does not affect ranking.
        const squared_distance = (a, b) =>
            Object.keys(a).reduce((sum, key) => sum + Math.pow(a[key] - b[key], 2), 0);
        const tree = new kdTree.kdTree(points, squared_distance, Object.keys(points[0]));
        const nearest = tree.nearest(search, k);
        // Each entry is read as [point, distance]; Object.values tolerates any
        // indexable collection since KD-tree.js is not visible from this file.
        return Object.values(nearest)
            .map(entry => [Object.values(entry[0]), entry[1]])
            .sort((a, b) => a[1] - b[1]);
    }

    if (type_distance === 'cosine') {
        // A similarity grows as vectors get closer, so it cannot be used as the
        // metric of a nearest-neighbour search that keeps the smallest values
        // (NOTE(review): assumed behaviour of src/KD-tree.js — confirm). An exact
        // scan ranks every point by similarity instead.
        const l2_norm = vector =>
            Math.sqrt(Object.values(vector).reduce((sum, value) => sum + value * value, 0));
        const search_keys = Object.keys(search);
        const search_norm = l2_norm(search);
        const scored = points.map(point => {
            const dot = search_keys.reduce((sum, key) => sum + search[key] * point[key], 0);
            const denominator = search_norm * l2_norm(point);
            // A zero-length vector has no direction; score it 0 instead of NaN.
            // The absolute value means opposite directions also count as similar.
            const similarity = denominator === 0 ? 0 : Math.abs(dot) / denominator;
            return [Object.values(point), similarity];
        });
        scored.sort((a, b) => b[1] - a[1]);
        return scored.slice(0, Math.max(0, Number(k) || 0));
    }

    return undefined;
};
/**
 * Runs `document` through the `tag` method of a `vntk` word tokenizer and
 * returns the result unchanged.
 *
 * @param {*} document - Passed to `tokenizer.tag` as its only argument.
 * @returns {*} Whatever `tag` returns.
 */
module.exports.VN_segmentation_tag = function (document) {
    // A fresh tokenizer is created on every call.
    const word_tokenizer = require('vntk').wordTokenizer();
    const tagged = word_tokenizer.tag(document);
    return tagged;
};
/**
 * Normalises a Vietnamese sentence, tokenizes it with `vntk`, and removes the
 * stop words listed in `src/stop_word_vn.txt` together with tokens shorter
 * than 2 characters.
 *
 * @param {string} document - Sentence to clean.
 * @returns {string} Remaining tokens joined by single spaces.
 * @throws {Error} When `vntk` or the stop-word file cannot be loaded.
 */
module.exports.clear_sentence_VN = function(document){
    // Lower-cases text, turns punctuation into spaces, drops digits and
    // collapses whitespace runs. Named normalize_text so it does not shadow
    // Node's global `process`.
    function normalize_text(text) {
        return text
            .replace(/[’“”%&!’#√.*+?,;^${}()_`'"|[\]\\//]/g, " ")
            .replace(/[0-9]/g, '')
            .replace(/[=:\-><@]/g, " ")
            .replace(/\s+/g, ' ')
            .toLocaleLowerCase()
            .trim();
    }

    const vntk = require('vntk');
    const fs = require('fs');
    const tokenizer = vntk.wordTokenizer();
    const tokens = tokenizer.tag(normalize_text(document));

    // Accept both CRLF and LF line endings; splitting on "\r\n" alone would turn
    // an LF-only file into one giant entry and disable the filter.
    const stop_words = new Set(
        fs.readFileSync(path.join(__dirname,"/src/stop_word_vn.txt")).toString().split(/\r?\n/)
    );

    return tokens
        .filter(token => !stop_words.has(normalize_text(token)))
        .filter(token => token.length >= 2)
        .join(' ')
        .trim();
};
/**
 * Normalises an English sentence and removes the stop words listed in
 * `src/stop_word.txt` together with words shorter than 2 characters.
 *
 * @param {string} document - Sentence to clean.
 * @returns {string} Remaining words joined by single spaces.
 * @throws {Error} When the stop-word file cannot be read.
 */
module.exports.clear_sentence_en = function(document){
    const fs = require('fs');

    // Lower-cases text, turns punctuation into spaces, drops digits and
    // collapses whitespace runs. Named normalize_text so it does not shadow
    // Node's global `process`.
    function normalize_text(text) {
        return text
            .replace(/[’“”%&!’#√.*+?,;^${}()_`'"|[\]\\//]/g, " ")
            .replace(/[0-9]/g, '')
            .replace(/[=:\-><@]/g, " ")
            .replace(/\s+/g, ' ')
            .toLocaleLowerCase()
            .trim();
    }

    // Accept both CRLF and LF line endings; splitting on "\r\n" alone would turn
    // an LF-only file into one giant entry and disable the filter.
    const stop_words = new Set(
        fs.readFileSync(path.join(__dirname,"/src/stop_word.txt")).toString().split(/\r?\n/)
    );

    return normalize_text(document)
        .split(' ')
        .filter(word => !stop_words.has(normalize_text(word)))
        .filter(word => word.length >= 2)
        .join(' ')
        .trim();
};
/**
 * Removes repeated words from a space-separated string.
 *
 * The text is split on single spaces, only the first occurrence of each word
 * is kept (in original order), and words shorter than 2 characters are
 * dropped.
 *
 * @param {string} document - Space-separated text.
 * @returns {string} The surviving words joined by single spaces, trimmed.
 */
module.exports.remove_duplicate_words = function(document){
    const unique_words = Array.from(new Set(document.split(' ')));
    // The length test also discards the empty strings produced by repeated spaces.
    const kept_words = unique_words.filter(word => word.length >= 2);
    return kept_words.join(' ').trim();
};
/**
 * Passes `text` to the `chatbot` function exported by `./src/Simplechatbot`
 * and returns its result.
 *
 * @param {*} text - Forwarded unchanged as the only argument.
 * @returns {*} Whatever `chatbot` returns.
 */
module.exports.fast_build_chatbot = function(text){
    // The helper module is required at call time, not when this file is loaded.
    const chatbot_module = require('./src/Simplechatbot');
    const reply = chatbot_module.chatbot(text);
    return reply;
};
/**
 * Passes `text` to the `sentiment` function exported by
 * `src/Check_sentiment.js` and returns its result.
 *
 * @param {*} text - Forwarded unchanged as the only argument.
 * @returns {*} Whatever `sentiment` returns.
 */
module.exports.sentiment_VN = function(text){
    // The helper module is required at call time, not when this file is loaded.
    const sentiment_module = require(path.join(__dirname, "/src/Check_sentiment.js"));
    const verdict = sentiment_module.sentiment(text);
    return verdict;
};
/**
 * Passes `error_text` to the `check_error_telex` function exported by
 * `src/Check_telex.js` and returns its result.
 *
 * @param {*} error_text - Forwarded unchanged as the only argument.
 * @returns {*} Whatever `check_error_telex` returns.
 */
module.exports.fix_telex = function(error_text){
    // The helper module is required at call time, not when this file is loaded.
    const telex_module = require(path.join(__dirname, "/src/Check_telex.js"));
    const corrected = telex_module.check_error_telex(error_text);
    return corrected;
};
/**
 * Passes `text` to the `check_language` function exported by
 * `src/Check_language.js` and returns its result.
 *
 * @param {*} text - Forwarded unchanged as the only argument.
 * @returns {*} Whatever `check_language` returns.
 */
module.exports.English_or_Vietnamese = function(text){
    // The helper module is required at call time, not when this file is loaded.
    const language_module = require(path.join(__dirname, "/src/Check_language.js"));
    const detected = language_module.check_language(text);
    return detected;
};