UNPKG

word-models

Version:

node.js interface to the Google word2vec tool

225 lines (188 loc) 6.07 kB
'use strict'; // MODULES // var spawn = require( 'child_process' ).spawn; var path = require( 'path' ); var _ = require( 'underscore' ); var changeCase = require( 'change-case' ); var loadModel = require( './model' ); var WordVector = require( './WordVector' ); // VARIABLES // var PACKAGE_FOLDER = path.join( __dirname, '..' ); var SRC_FOLDER = PACKAGE_FOLDER + '/src/'; // NAMESPACE // var ns = {}; /* Parameters for training: -train <file> Use text data from <file> to train the model -output <file> Use <file> to save the resulting word vectors / word clusters -size <int> Set size of word vectors; default is 100 -window <int> Set max skip length between words; default is 5 -sample <float> Set threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5) -hs <int> Use Hierarchical Softmax; default is 0 (not used) -negative <int> Number of negative examples; default is 5, common values are 3 - 10 (0 = not used) -threads <int> Use <int> threads (default 12) -iter <int> Run more training iterations (default 5) -min-count <int> This will discard words that appear less than <int> times; default is 5 -alpha <float> Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW -classes <int> Output word classes rather than word vectors; default number of classes is 0 (vectors are written) -debug <int> Set the debug mode (default = 2 = more info during training) -binary <int> Save the resulting vectors in binary moded; default is 0 (off) -save-vocab <file> The vocabulary will be saved to <file> -read-vocab <file> The vocabulary will be read from <file>, not constructed from the training data -cbow <int> Use the continuous bag of words model; default is 1 (use 0 for skip-gram model) */ function word2vec( input, output, params, callback ) { var paramsArr; var inputPath; var cc_name; var silent = false; var name; var cwd; if ( params === undefined ) { params = {}; } if ( callback === undefined ) { callback = function(){}; } if ( params.hasOwnProperty( 'silent' ) === true ) { silent = params.silent; } if ( !silent ) { var charm = require('charm')(); charm.pipe(process.stdout); charm.reset(); } paramsArr = []; for ( name in params ) { if ( params.hasOwnProperty(name) === true && name !== 'silent' ) { cc_name = changeCase.param( name ); paramsArr.push( '-' + cc_name ); paramsArr.push( params[name] ); } } console.log('input is String ?', _.isString(input)) console.log('output is String ?', _.isString(output)) if( _.isString(input) === false || _.isString(output) === false ) { throw new TypeError( 'Input and output file names must be strings' ); } cwd = process.cwd(); inputPath = path.resolve( cwd, input ); input = path.relative( SRC_FOLDER, inputPath ); output = path.relative( SRC_FOLDER, output ); var word2vecProc = spawn( './word2vec', ['-train', input, '-output', output ].concat(paramsArr), { cwd: SRC_FOLDER } ); word2vecProc.stdout.on( 'data', function onData( data ) { if ( !silent ) { charm.foreground( 'cyan' ); charm.write( data ); } }); word2vecProc.on( 'close', function onClose( code ) { if ( !silent ) { charm.write( '\n' ); charm.foreground( 'yellow' ); charm.write( 'Child process exited with code: ' + code ); charm.write( '\n' ); charm.display( 'reset' ); } callback( code ); }); } // end FUNCTION word2vec() /* Options: Parameters for training: -train <file> Use text data from <file> to train the model -output <file> Use <file> to save the resulting word vectors / word clusters / phrases -min-count <int> This will discard words that appear less than <int> times; default is 5 -threshold <float> The <float> value represents threshold for forming the phrases (higher means less phrases); default 100 -debug <int> Set the debug mode (default = 2 = more info during training) Examples: ./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2 */ function word2phrase( input, output, params, callback ) { var inputPath; var paramsArr; var cc_name; var silent = false; var charm; var name; var cwd; if ( params === undefined ) { params = {}; } if ( callback === undefined ) { callback = function(){}; } if ( params.hasOwnProperty( 'silent' ) === true ) { silent = params.silent; } if ( !silent ) { charm = require( 'charm' )(); charm.pipe( process.stdout ); charm.reset(); } paramsArr = []; for ( name in params ){ if ( params.hasOwnProperty( params ) === true ) { cc_name = changeCase.param( name ); paramsArr.push( '-' + cc_name ); paramsArr.push( params[name] ); } } if ( _.isString(input) === false || _.isString(output) === false ) { throw new TypeError( 'Input and output files have to be supplied as strings' ); } cwd = process.cwd(); inputPath = path.resolve( cwd, input ); input = path.relative( SRC_FOLDER, inputPath ); output = path.relative( SRC_FOLDER, output ); var word2phraseProc = spawn( './word2phrase', [ '-train', input, '-output', output ].concat(paramsArr), { cwd: SRC_FOLDER } ); word2phraseProc.stdout.on( 'data', function onData(data) { if ( !silent ) { charm.foreground( 'cyan' ); charm.write( data ); } }); word2phraseProc.on( 'close', function onClose( code ) { if ( !silent ) { charm.write( '\n' ); charm.foreground( 'yellow' ); charm.write( 'Child process exited with code ' + code ); charm.write( '\n' ); charm.display( 'reset' ); } callback( code ); }); } // end FUNCTION word2phrase() ns.word2vec = word2vec; ns.word2phrase = word2phrase; ns.loadModel = loadModel; ns.WordVector = WordVector; // EXPORTS // module.exports = ns;