// node-tfidf — TF-IDF (term frequency / inverse document frequency)
// scoring for Node.js, with English + Vietnamese stopword filtering.
var fs = require('fs');
var _ = require("underscore")._;
var Tokenizer = require('node-vntokenizer');
var tokenizer = new Tokenizer();
var stopwords_en = require('./model/stopwords-en');
var stopwords_vi = require('vietnamese-stopwords');
var stopwords = _.union(stopwords_en, stopwords_vi);
// Build a term-frequency document:
//  - from a raw string: lowercased, tokenized, stopwords filtered out;
//  - from a pre-tokenized term array: counted as-is (no stopword
//    filtering, matching the historical behavior);
//  - anything else is passed through untouched (assumed to already be a
//    built document, e.g. from deserialization).
// The accumulator is a null-prototype object so tokens that collide with
// Object.prototype members ('constructor', 'hasOwnProperty', and notably
// '__proto__', whose assignment is silently ignored on plain objects) are
// counted correctly instead of hitting inherited properties.
function buildDocument(text, key) {
  var stopOut = false;
  if (typeof text === 'string') {
    text = tokenizer.tokenize(text.toLowerCase());
    stopOut = true;
  } else if (!Array.isArray(text)) {
    // Not a string, not an array: assume it is already a document.
    return text;
  }
  var document = Object.create(null);
  document.__key = key;
  return text.reduce(function(doc, term) {
    if (!stopOut || stopwords.indexOf(term) < 0)
      doc[term] = (doc[term] ? doc[term] + 1 : 1);
    return doc;
  }, document);
}
// Raw term frequency: how many times `term` occurs in `document`,
// 0 when the term is absent.
function tf(term, document) {
  var count = document[term];
  return count ? count : 0;
}
// True when `document` contains at least one occurrence of `term`.
// NOTE: intentionally returns the raw falsy value (undefined / 0) when
// the term is absent, exactly like a bare `&&` chain would.
function documentHasTerm(term, document) {
  var freq = document[term];
  return freq && freq > 0;
}
// TfIdf corpus. Optionally restore `documents` from a previously
// serialized instance; the idf cache always starts empty.
function TfIdf(deserialized) {
  this.documents = deserialized ? deserialized.documents : [];
  this._idfCache = {};
}
// Validate a Buffer encoding name. Prefers Buffer.isEncoding when the
// runtime provides it; otherwise (node < 0.10) falls back to a
// hard-coded whitelist, case-insensitively.
function isEncoding(encoding) {
  if (typeof Buffer.isEncoding !== 'undefined')
    return Buffer.isEncoding(encoding);
  var supported = [
    'hex', 'utf8', 'utf-8', 'ascii', 'binary', 'base64',
    'ucs2', 'ucs-2', 'utf16le', 'utf-16le', 'raw'
  ];
  return supported.indexOf((encoding + '').toLowerCase()) !== -1;
}
// Export the constructor; `tf` is also exposed as a static helper so
// callers can get raw term frequency without an instance.
module.exports = TfIdf;
TfIdf.tf = tf;
// Inverse document frequency of `term`, smoothed as
// 1 + ln(N / (1 + docsWithTerm)). Results are memoized per instance;
// pass force=true to bypass the cache and recompute.
TfIdf.prototype.idf = function(term, force) {
  // Serve from the cache unless a recomputation is forced. The
  // hasOwnProperty check keeps inherited Object.prototype members
  // (e.g. term === 'constructor') from masquerading as cached scores.
  if (force !== true && this._idfCache.hasOwnProperty(term) && this._idfCache[term])
    return this._idfCache[term];
  var docsWithTerm = 0;
  this.documents.forEach(function(document) {
    if (documentHasTerm(term, document))
      docsWithTerm += 1;
  });
  var idf = 1 + Math.log(this.documents.length / (1 + docsWithTerm));
  // Refresh the cache before returning.
  this._idfCache[term] = idf;
  return idf;
};
// Add a document (raw string or pre-tokenized array) to the corpus.
// Adding a document invalidates every cached idf score: when
// restoreCache is true each cached term is recomputed in place,
// otherwise the cache is simply wiped.
TfIdf.prototype.addDocument = function(document, key, restoreCache) {
  this.documents.push(buildDocument(document, key));
  if (restoreCache !== true) {
    // Cheap path: drop the whole cache; scores recompute lazily.
    this._idfCache = {};
    return;
  }
  // idf(term, true) forces a recomputation and rewrites the cache entry.
  for (var term in this._idfCache)
    this.idf(term, true);
};
// Synchronously read a file and add its contents as a document.
// Defaults to utf8; throws on an unrecognized encoding. Cache handling
// mirrors addDocument: recompute each cached idf when restoreCache is
// true, otherwise wipe the cache.
TfIdf.prototype.addFileSync = function(path, encoding, key, restoreCache) {
  var enc = encoding ? encoding : 'utf8';
  if (!isEncoding(enc))
    throw new Error('Invalid encoding: ' + enc);
  var text = fs.readFileSync(path, enc);
  this.documents.push(buildDocument(text, key));
  if (restoreCache !== true) {
    // Cheap path: drop the whole cache; scores recompute lazily.
    this._idfCache = {};
    return;
  }
  // idf(term, true) forces a recomputation and rewrites the cache entry.
  for (var term in this._idfCache)
    this.idf(term, true);
};
// Summed tf-idf score of `terms` (a string to tokenize, or an array of
// tokens) against document index `d`.
TfIdf.prototype.tfidf = function(terms, d) {
  var self = this;
  var tokens = _.isArray(terms)
    ? terms
    : tokenizer.tokenize(terms.toString().toLowerCase());
  var total = 0.0;
  tokens.forEach(function(term) {
    var idf = self.idf(term);
    // Guard against a degenerate idf blowing up the sum.
    if (idf === Infinity)
      idf = 0;
    total += tf(term, self.documents[d]) * idf;
  });
  return total;
};
// List every term in document `d` with its tf-idf score, highest first.
// The internal '__key' marker is excluded.
TfIdf.prototype.listTerms = function(d) {
  var self = this;
  var entries = [];
  Object.keys(this.documents[d]).forEach(function(term) {
    if (term === '__key')
      return;
    entries.push({
      term: term,
      tfidf: self.tfidf(term, d)
    });
  });
  entries.sort(function(a, b) {
    return b.tfidf - a.tfidf;
  });
  return entries;
};
// Score `terms` against every document in the corpus. Returns the array
// of scores; when `callback` is given it is also invoked per document
// with (index, score, documentKey).
TfIdf.prototype.tfidfs = function(terms, callback) {
  var self = this;
  return this.documents.map(function(document, i) {
    var score = self.tfidf(terms, i);
    if (callback)
      callback(i, score, document.__key);
    return score;
  });
};
// Define a tokenizer other than the default "WordTokenizer".
// Rejects null/undefined explicitly so callers get the intended
// Error instead of a TypeError from probing `.tokenize` on nothing.
TfIdf.prototype.setTokenizer = function(t) {
  if (!t || !_.isFunction(t.tokenize))
    throw new Error('Expected a valid Tokenizer');
  tokenizer = t;
};