word-models

Node.js interface to the Google word2vec tool

546 lines (486 loc) 11.7 kB
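For orientation before reading the source, here is a minimal usage sketch. It assumes this file is required directly (the path ./lib/model.js is a placeholder) and that a pre-trained model file vectors.txt exists; neither name is defined by this file itself:

var loadModel = require( './lib/model.js' ); // hypothetical path to the module below

// Load a word2vec model; the parsed model is handed to the callback:
loadModel( 'vectors.txt', function( err, model ) {
    if ( err ) {
        throw err;
    }
    console.log( model.similarity( 'king', 'queen' ) );            // cosine similarity of two in-vocabulary words
    console.log( model.mostSimilar( 'king', 5 ) );                 // five nearest neighbours of 'king'
    console.log( model.analogy( 'woman', [ 'man', 'king' ], 1 ) ); // best match for king - man + woman
});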
'use strict';

// MODULES //

var fs = require( 'fs' );
var readline = require( 'readline' );
var Stream = require( 'stream' );
var mime = require( 'mime' );
var _ = require( 'underscore' );
var path = require( 'path' );
var WordVec = require( './WordVector' );


// LOAD MODEL //

module.exports = function loadModel( file, callback ) {
    var model = {};
    var vocab = [];
    var words;
    var size;
    var err = null;
    var N; // number of closest words that will be shown

    if ( typeof file !== 'string' ) {
        throw new TypeError( 'Function expects file name as its first parameter.' );
    }
    file = path.resolve( process.cwd(), file );
    if ( callback === undefined ) {
        callback = function(){};
    }

    model.getVector = function( word ) {
        for ( var i = 0; i < words; i++ ) {
            if ( vocab[i].word === word ) {
                return vocab[i];
            }
        }
        return null;
    };

    model.getVectors = function( words ) {
        if ( !words ) {
            return vocab;
        } else {
            return vocab.filter( function onElement( w ) {
                return _.contains( words, w.word );
            });
        }
    };

    model.similarity = function similarity( word1, word2 ) {
        var vecs = [];
        var sum;
        var i;
        for ( i = 0; i < words; i++ ) {
            if ( vocab[i].word === word1 || vocab[i].word === word2 ) {
                vecs.push( vocab[i].values );
            }
        }
        if ( vecs.length === 2 ) {
            sum = 0;
            for ( i = 0; i < size; i++ ) {
                sum += vecs[0][i] * vecs[1][i];
            }
            return sum;
        } else if ( vecs.length === 1 && word1 === word2 ) {
            // Case: word1 and word2 are identical:
            return 1.0;
        }
        // Case: At least one of the words is not available in vocabulary:
        return null;
    };

    function normalize( values ) {
        var a;
        var vec = values;
        var size = values.length;
        var len = 0;
        for ( a = 0; a < size; a++ ) {
            len += vec[a] * vec[a];
        }
        len = Math.sqrt( len );
        for ( a = 0; a < size; a++ ) {
            vec[a] /= len;
        }
        return vec;
    }

    model.getNearestWord = function getNearestWord( vec ) {
        var bestw;
        var bestd;
        var c;
        var a;
        if ( vec instanceof WordVec === true ) {
            vec = vec.values;
        }
        vec = normalize( vec );
        for ( c = 0; c < words; c++ ) {
            var dist = 0;
            for ( a = 0; a < size; a++ ) {
                dist += vec[a] * vocab[c].values[a];
            }
            if ( c === 0 || dist > bestd ) {
                bestd = dist;
                bestw = vocab[c].word;
            }
        }
        var o = {};
        o.word = bestw;
        o.dist = bestd;
        return o;
    };

    model.getNearestWords = function getNearestWords( vec, N_input ) {
        var bestd;
        var bestw;
        var dist;
        var ret;
        var d;
        var i;
        var c;
        var a;

        N = N_input || 10;
        if ( vec instanceof WordVec === true ) {
            vec = vec.values;
        }
        vec = normalize( vec );
        bestw = new Array( N );
        bestd = Array.apply( null, new Array( N ) ).map( Number.prototype.valueOf, -1 );
        for ( c = 0; c < words; c++ ) {
            dist = 0;
            for ( a = 0; a < size; a++ ) {
                dist += vec[a] * vocab[c].values[a];
            }
            for ( a = 0; a < N; a++ ) {
                if ( dist > bestd[a] ) {
                    for ( d = N - 1; d > a; d-- ) {
                        bestd[d] = bestd[d - 1];
                        bestw[d] = bestw[d - 1];
                    }
                    bestd[a] = dist;
                    bestw[a] = vocab[c].word;
                    break;
                }
            }
        }
        ret = [];
        for ( i = 0; i < N; i++ ) {
            var o = {};
            o.word = bestw[i];
            o.dist = bestd[i];
            ret[i] = o;
        }
        return ret;
    };

    model.mostSimilar = function mostSimilar( input_phrase, N_input ) {
        var phrase_words;
        var phrase;
        var bestw;
        var bestd;
        var found;
        var dist;
        var vec;
        var len;
        var cn;
        var a;
        var b;
        var c;
        var i;
        var d;
        var o;

        N = N_input || 40;
        phrase = {
            words: [],
            output: {}
        };
        phrase_words = input_phrase.split( ' ' );
        for ( i = 0; i < phrase_words.length; i++ ) {
            o = {
                word: phrase_words[ i ],
                pos: -1
            };
            phrase.words.push( o );
        }
        bestw = new Array( N );
        bestd = Array.apply( null, new Array( N ) ).map( Number.prototype.valueOf, -1 );
        cn = phrase.words.length;

        // Boolean checking whether at least one phrase word is in dictionary...
        found = false;
        for ( a = 0; a < cn; a++ ) {
            for ( b = 0; b < words; b++ ) {
                if ( phrase.words[a].word === vocab[b].word ) {
                    found = true;
                    phrase.words[a].pos = b;
                    break;
                }
            }
            if ( phrase.words[a].pos === -1 ) {
                console.log( 'Out of dictionary word: ' + phrase.words[a].word + '\n' );
            }
        }
        if ( found === false ) {
            // All words are out-of-dictionary, return `null`:
            return null;
        }
        vec = [];
        for ( i = 0; i < size; i++ ) {
            vec[i] = 0;
        }
        for ( b = 0; b < cn; b++ ) {
            if ( phrase.words[b].pos !== -1 ) {
                for ( a = 0; a < size; a++ ) {
                    vec[a] += vocab[ phrase.words[b].pos ].values[a];
                }
            }
        }

        // Normalizing vector vec...
        len = 0;
        for ( a = 0; a < size; a++ ) {
            len += vec[a] * vec[a];
        }
        len = Math.sqrt( len );
        for ( a = 0; a < size; a++ ) {
            vec[a] = vec[a] / len;
        }

        // Iterate through vocabulary...
        for ( c = 0; c < words; c++ ) {
            a = 0;
            for ( b = 0; b < cn; b++ ) {
                if ( phrase.words[b].pos === c ) {
                    a = 1;
                }
            }
            if ( a !== 1 ) {
                dist = 0;
                for ( a = 0; a < size; a++ ) {
                    dist += vec[a] * vocab[c].values[a];
                }
                for ( a = 0; a < N; a++ ) {
                    if ( dist > bestd[a] ) {
                        for ( d = N - 1; d > a; d-- ) {
                            bestd[d] = bestd[d - 1];
                            bestw[d] = bestw[d - 1];
                        }
                        bestd[a] = dist;
                        bestw[a] = vocab[c].word;
                        break;
                    }
                }
            }
        }
        var ret = [];
        for ( i = 0; i < N; i++ ) {
            o = {};
            o.word = bestw[i];
            o.dist = bestd[i];
            ret[i] = o;
        }
        return ret;
    };

    model.analogy = function analogy( word, pair, N_input ) {
        var phrase;
        var bestw;
        var bestd;
        var ret;
        var vec;
        var bi;
        var cn;
        var a;
        var b;
        var d;
        var i;
        var o;

        N = N_input || 40;
        if ( _.isString( word ) === false ) {
            throw new TypeError( 'Word of interest has to be supplied as string.' );
        }
        if ( _.isArray( pair ) === false ) {
            throw new TypeError( 'Word pair has to be supplied in string Array.' );
        }
        phrase = {
            words: pair,
            output: {}
        };
        phrase.words.push( word );
        phrase.words = phrase.words.map( function( word ) {
            o = {};
            o.word = word;
            o.pos = -1;
            return o;
        });
        bestw = new Array( N );
        bestd = Array.apply( null, new Array( N ) ).map( Number.prototype.valueOf, 0 );
        cn = phrase.words.length;
        bi = phrase.words;
        vec = Array.apply( null, new Array( size ) ).map( Number.prototype.valueOf, 0 );
        for ( a = 0; a < cn; a++ ) {
            for ( b = 0; b < words; b++ ) {
                if ( phrase.words[a].word === vocab[b].word ) {
                    phrase.words[a].pos = b;
                    break;
                }
            }
            if ( phrase.words[a].pos === -1 ) {
                console.log( 'Out of dictionary word: ' + phrase.words[a].word + '\n' );
                break;
            }
        }
        for ( b = 0; b < cn; b++ ) {
            if ( phrase.words[b].pos !== -1 ) {
                for ( a = 0; a < size; a++ ) {
                    vec[a] += vocab[ phrase.words[b].pos ].values[a];
                }
            }
        }
        for ( a = 0; a < size; a++ ) {
            vec[a] = vocab[ bi[1].pos ].values[a] - vocab[ bi[0].pos ].values[a] + vocab[ bi[2].pos ].values[a];
        }
        var len = 0;
        for ( a = 0; a < size; a++ ) {
            len += vec[a] * vec[a];
        }
        len = Math.sqrt( len );
        for ( a = 0; a < size; a++ ) {
            vec[a] /= len;
        }
        for ( var c = 0; c < words; c++ ) {
            if ( c === bi[0].pos ) {
                continue;
            }
            if ( c === bi[1].pos ) {
                continue;
            }
            if ( c === bi[2].pos ) {
                continue;
            }
            a = 0;
            for ( b = 0; b < cn; b++ ) {
                if ( bi[b].pos === c ) {
                    a = 1;
                }
            }
            if ( a === 1 ) {
                continue;
            }
            var dist = 0;
            for ( a = 0; a < size; a++ ) {
                dist += vec[a] * vocab[c].values[a];
            }
            for ( a = 0; a < N; a++ ) {
                if ( dist > bestd[a] ) {
                    for ( d = N - 1; d > a; d-- ) {
                        bestd[d] = bestd[d - 1];
                        bestw[d] = bestw[d - 1];
                    }
                    bestd[a] = dist;
                    bestw[a] = vocab[c].word;
                    break;
                }
            }
        }
        ret = [];
        for ( i = 0; i < N; i++ ) {
            o = {};
            o.word = bestw[i];
            o.dist = bestd[i];
            ret[ i ] = o;
        }
        return ret;
    };

    function init( file ) {
        var mime_type = mime.getType( file );
        switch ( mime_type ) {
        case 'text/plain':
            readTxt( file );
            break;
        case 'application/octet-stream':
            readBinary( file );
            break;
        }
    }

    init( file );

    function readTxt( file ) {
        var instream = fs.createReadStream( file );
        var outstream = new Stream();
        var rl = readline.createInterface( instream, outstream );
        var counter = 0;

        rl.on( 'line', function( line ) {
            var values;
            var word;
            var arr;
            var len;
            var val;
            var a;
            var i;
            var o;
            if ( counter === 0 ) {
                arr = line.split( ' ' );
                words = arr[0];
                size = arr[1];
                if ( isNaN( words ) || isNaN( size ) ) {
                    throw new Error( 'First line of text file has to be <number of words> <length of vector>.' );
                }
            } else {
                arr = line.split( ' ' );
                word = arr.shift( 1 );
                values = [];
                for ( i = 0; i < arr.length; i++ ) {
                    val = arr[ i ];
                    if ( val !== '' ) {
                        values.push( parseFloat( val ) );
                    }
                }
                o = new WordVec( word, values );
                len = 0;
                for ( a = 0; a < size; a++ ) {
                    len += o.values[a] * o.values[a];
                }
                len = Math.sqrt( len );
                for ( a = 0; a < size; a++ ) {
                    o.values[a] /= len;
                }
                vocab.push( o );
            }
            counter++;
        });

        rl.on( 'close', function() {
            model.words = words;
            model.size = size;
            callback( err, model );
        });
    }

    function readBinary( file ) {
        var MAX_STRING = 100;
        fs.open( file, 'r', function( status, fd ) {
            // Need enough data to cover the string for two long longs...
            var buffer = new Buffer( 50 );
            var pos = 0;
            fs.read( fd, buffer, 0, 50, 0, function( err, bytesRead, buffer ) {
                var arr;
                if ( err ) {
                    callback( err, null );
                    return;
                }
                arr = buffer.toString().split( /[ \n]/ );
                words = parseInt( arr[0], 10 );
                size = parseInt( arr[1], 10 );
                pos += arr[0].length;
                pos += arr[1].length;
                pos += 2; // whitespaces

                // maximum length of a vector is 100
                // and size*4 bytes for the floats
                buffer = new Buffer( MAX_STRING + size * 4 );
                fs.read( fd, buffer, 0, MAX_STRING + size * 4, pos, function readOne( err, bytesRead, buffer ) {
                    var values;
                    var word;
                    var len;
                    var off;
                    var a;
                    var i;
                    var o;
                    if ( err ) {
                        callback( err, null );
                        return;
                    }
                    if ( bytesRead < size * 4 ) {
                        model.words = words;
                        model.size = size;
                        callback( err, model );
                        return;
                    }
                    word = buffer
                        .toString()
                        .split( ' ' )[0];
                    pos += Buffer.from( word ).byteLength + 1;
                    pos += size * 4;
                    off = Buffer.from( word ).byteLength + 1;
                    values = new Float32Array( size );
                    for ( i = 0; i < size; i++ ) {
                        values[ i ] = buffer.readFloatLE( off );
                        off += 4;
                    }
                    o = new WordVec( word.trim(), values );
                    len = 0;
                    for ( a = 0; a < size; a++ ) {
                        len += o.values[a] * o.values[a];
                    }
                    len = Math.sqrt( len );
                    for ( a = 0; a < size; a++ ) {
                        o.values[a] /= len;
                    }
                    vocab.push( o );
                    fs.read( fd, buffer, 0, MAX_STRING + size * 4, pos, readOne );
                });
            });
        });
    }
};
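The readTxt() parser above expects the standard word2vec text format: a header line of the form <number of words> <length of vector>, followed by one word per line together with its vector components. A small self-contained sketch of that format (the file name, words, and values are invented for illustration, and the require path is again a placeholder):

var fs = require( 'fs' );
var loadModel = require( './lib/model.js' ); // hypothetical path to the module above

// Write a toy model: 2 words, 3-dimensional vectors, in the text format readTxt() parses:
fs.writeFileSync( 'toy-model.txt', '2 3\nking 0.5 0.1 0.4\nqueen 0.45 0.2 0.35\n' );

loadModel( 'toy-model.txt', function( err, model ) {
    console.log( model.words, model.size );             // header values parsed from the first line
    console.log( model.getVector( 'king' ).values );    // unit-normalized vector for 'king'
    console.log( model.similarity( 'king', 'queen' ) ); // dot product of the two normalized vectors
});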