node-tfidf

var fs = require('fs');
var _ = require("underscore")._;
var Tokenizer = require('node-vntokenizer');
var tokenizer = new Tokenizer();

var stopwords_en = require('./model/stopwords-en');
var stopwords_vi = require('vietnamese-stopwords');
var stopwords = _.union(stopwords_en, stopwords_vi);

function buildDocument(text, key) {
  var stopOut;

  if (typeof text === 'string') {
    text = tokenizer.tokenize(text.toLowerCase());
    stopOut = true;
  } else if (!_.isArray(text)) {
    stopOut = false;
    return text;
  }

  return text.reduce(function(document, term) {
    if (typeof document[term] === 'function') document[term] = 0;
    if (!stopOut || stopwords.indexOf(term) < 0)
      document[term] = (document[term] ? document[term] + 1 : 1);
    return document;
  }, { __key: key });
}

function tf(term, document) {
  return document[term] ? document[term] : 0;
}

function documentHasTerm(term, document) {
  return document[term] && document[term] > 0;
}

function TfIdf(deserialized) {
  if (deserialized)
    this.documents = deserialized.documents;
  else
    this.documents = [];
  this._idfCache = {};
}

// Backwards compatibility for Node < 0.10 (no Buffer.isEncoding).
function isEncoding(encoding) {
  if (typeof Buffer.isEncoding !== 'undefined')
    return Buffer.isEncoding(encoding);

  switch ((encoding + '').toLowerCase()) {
    case 'hex':
    case 'utf8':
    case 'utf-8':
    case 'ascii':
    case 'binary':
    case 'base64':
    case 'ucs2':
    case 'ucs-2':
    case 'utf16le':
    case 'utf-16le':
    case 'raw':
      return true;
  }
  return false;
}

module.exports = TfIdf;
TfIdf.tf = tf;

TfIdf.prototype.idf = function(term, force) {
  // Look the term up in the term-IDF cache; this cuts lookup time
  // dramatically on large document sets.
  if (this._idfCache[term] && this._idfCache.hasOwnProperty(term) && force !== true)
    return this._idfCache[term];

  var docsWithTerm = this.documents.reduce(function(count, document) {
    return count + (documentHasTerm(term, document) ? 1 : 0);
  }, 0);

  var idf = 1 + Math.log((this.documents.length) / (1 + docsWithTerm));

  // Add the idf to the term cache and return it.
  this._idfCache[term] = idf;
  return idf;
};

// If restoreCache is set to true, all term idf scores currently cached will be
// recomputed. Otherwise, the cache is simply wiped clean.
TfIdf.prototype.addDocument = function(document, key, restoreCache) {
  this.documents.push(buildDocument(document, key));

  // Make sure the cache is invalidated when new documents arrive.
  if (restoreCache === true) {
    for (var term in this._idfCache) {
      // Invoking idf with the force option set forces a recomputation of the
      // idf, which automatically refreshes the cache value.
      this.idf(term, true);
    }
  } else {
    this._idfCache = {};
  }
};

// If restoreCache is set to true, all term idf scores currently cached will be
// recomputed. Otherwise, the cache is simply wiped clean.
TfIdf.prototype.addFileSync = function(path, encoding, key, restoreCache) {
  if (!encoding)
    encoding = 'utf8';
  if (!isEncoding(encoding))
    throw new Error('Invalid encoding: ' + encoding);

  var document = fs.readFileSync(path, encoding);
  this.documents.push(buildDocument(document, key));

  // Make sure the cache is invalidated when new documents arrive.
  if (restoreCache === true) {
    for (var term in this._idfCache) {
      // Invoking idf with the force option set forces a recomputation of the
      // idf, which automatically refreshes the cache value.
      this.idf(term, true);
    }
  } else {
    this._idfCache = {};
  }
};

TfIdf.prototype.tfidf = function(terms, d) {
  var _this = this;

  if (!_.isArray(terms))
    terms = tokenizer.tokenize(terms.toString().toLowerCase());

  return terms.reduce(function(value, term) {
    var idf = _this.idf(term);
    idf = idf === Infinity ? 0 : idf;
    return value + (tf(term, _this.documents[d]) * idf);
  }, 0.0);
};

TfIdf.prototype.listTerms = function(d) {
  var terms = [];

  for (var term in this.documents[d]) {
    if (term != '__key')
      terms.push({ term: term, tfidf: this.tfidf(term, d) });
  }

  return terms.sort(function(x, y) {
    return y.tfidf - x.tfidf;
  });
};

TfIdf.prototype.tfidfs = function(terms, callback) {
  var tfidfs = new Array(this.documents.length);

  for (var i = 0; i < this.documents.length; i++) {
    tfidfs[i] = this.tfidf(terms, i);

    if (callback)
      callback(i, tfidfs[i], this.documents[i].__key);
  }

  return tfidfs;
};

// Replace the default tokenizer (node-vntokenizer).
TfIdf.prototype.setTokenizer = function(t) {
  if (!_.isFunction(t.tokenize))
    throw new Error('Expected a valid Tokenizer');
  tokenizer = t;
};
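
Below is a minimal usage sketch of the exported TfIdf class. It assumes this file is the package's main entry, so it can be required as 'node-tfidf'; the sample documents and keys are invented for illustration. Note that the default tokenizer is node-vntokenizer and the stopword list is the union of the English and Vietnamese lists loaded above.

// Minimal usage sketch (assumes this module resolves as 'node-tfidf').
var TfIdf = require('node-tfidf');

var tfidf = new TfIdf();

// Index two sample documents; the second argument is an arbitrary key
// that tfidfs() passes back to its callback.
tfidf.addDocument('node is a javascript runtime', 'doc-node');
tfidf.addDocument('ruby and node are programming tools', 'doc-ruby');

// TF-IDF score of a term in one document, addressed by index.
console.log(tfidf.tfidf('node', 0));

// Scores of a term across every document, with the per-document callback.
tfidf.tfidfs('node', function(i, measure, key) {
  console.log('document #' + i + ' (' + key + '): ' + measure);
});

// All terms of document 0, sorted by descending TF-IDF.
tfidf.listTerms(0).forEach(function(item) {
  console.log(item.term + ': ' + item.tfidf);
});

Because addDocument() accepts either a raw string or a pre-tokenized array, callers who need a different tokenization can either pass arrays directly or swap the tokenizer with setTokenizer().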