cjk-tokenizer
Version:
A CJK text tokenizer
41 lines (36 loc) • 1.09 kB
JavaScript
;
/**
* cjk-tokenizer command-line client: reads a text file, tokenizes it and prints the tokens as JSON.
* @module cjk-tokenizer
* @see module:index
*/
const fs = require('fs');
const commander = require('commander');
const pkg = require('../package.json');
const cjkTokenizer = require('../lib');
/**
 * Parses a comma-separated command-line value into a list of entries.
 *
 * Whitespace around each entry is trimmed and empty entries are dropped, so
 * `"a, b,,c"` yields `['a', 'b', 'c']` instead of entries with stray spaces
 * or empty strings.
 *
 * @param {string} val - Raw option value, e.g. `"ja,zh"`.
 * @returns {string[]} The non-empty, trimmed entries in their original order.
 */
function list(val) {
  return val
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry !== '');
}
// Command-line definition: one required <filename> argument plus the optional
// comma-separated --stopWords and --languages lists (both parsed by `list`).
commander
  .version(pkg.version)
  .arguments('<filename>')
  .option('-s, --stopWords <stopWords>', 'stop words', list)
  .option('-l, --languages <languages>', 'languages', list)
  .action((filename) => {
    // Read the whole input file as UTF-8; on failure report it and exit with
    // status 1.
    let content;
    try {
      content = fs.readFileSync(filename, 'utf-8');
    } catch (e) {
      console.error(`Error: unable to read ${filename}.`);
      process.exit(1);
    }

    // Prefer opts() for the parsed option values: recent Commander releases no
    // longer expose options as properties on the command object by default.
    // Fall back to the command itself for releases that lack opts().
    const options = typeof commander.opts === 'function' ? commander.opts() : commander;

    // Options that were not supplied fall back to null (languages) and an
    // empty list (stopWords).
    const tokens = cjkTokenizer.tokenize(content, {
      languages: options.languages || null,
      stopWords: options.stopWords || [],
    });

    // Emit the tokens as pretty-printed JSON on stdout.
    console.log(JSON.stringify(tokens, null, 2));
  });
// Run the client: parse the command line first, then print the usage text
// when the program was started without any arguments.
commander.parse(process.argv);
if (process.argv.length === 2) {
  commander.outputHelp();
}