UNPKG

pos2tracery

Version:

a POS-tagger to generate Tracery grammars, and a tool to merge grammars

231 lines (217 loc) 7.35 kB
/* * pos2tracery.js * Copyleft (ↄ) 2019 jkirchartz <me@jkirchartz.com> * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ const utils = require('./utils'); const util = require('wink-nlp-utils'); const pos = require('wink-pos-tagger'); const contractions = require('expand-contractions'); const tagger = pos(); exports.command = 'pos <input> [output]'; exports.alias = 'p'; exports.describe = 'convert corpus to tracery with POS tags'; exports.builder = function (yargs) { yargs.usage('Usage: $0 [pos|p] <input> [output] [options]') .positional("input", { describe: "input/source file", type: "string", nargs: 1, demand: "input/source file is required" }).positional("output", { describe: "optional output/destination file, if not set file prints to stdout", type: "string", nargs: 1, }).options({ "verbose": { alias: "v", describe: "print details while processing", type: "count" }, "percent": { alias: "p", describe: "limit the percentage of words replaced with their POS tags, as number between 1 and 100", default: 100, type: "number", nargs: 1, }, "modifiers" : { alias: "m", describe: "replace english modifiers with their equivalent tracery.modifier function", default: false, type: "boolean" }, "origin": { alias: "o", describe: "Include \"origin\" key in tracery file, specify --no-origin to not add this key", type: "boolean", default: true }, "ignore" : { "alias": "i", describe: "list of parts of speech to not tagify", default: [], type: "array" }, "split" : { alias: "s", describe: "determine string splitting strategy: line, paragraph, or sentence", choices: ["l", "p", "s"], nargs: 1, default: "s" } }).coerce(['input', 'output'], require('path').resolve).help('h').alias('h', 'help'); }; exports.handler = function pos2tracery(args, type) { var tracery = { 'sentences' : []}; if (args.origin) { tracery["origin"] = ["#sentences#"]; } const log = utils.logger(args.verbose); // generate tracery from POS const parseSentence = (str) => { // clean up sentence str = str.replace(/[\r\n\s]+/g, ' '); str = str.replace(/--/g, '-'); str = contractions.expand(str); // tag sentence let taggedSentence = tagger.tagSentence(str); // fix peculiarity of wink-pos-tagger tagging sentences as NNP let scragglers = []; taggedSentence = taggedSentence.filter((obj) => { if (obj.pos.slice(0,3) === "NPP" && (obj.value.indexOf(" ") > -1 || obj.value.indexOf("\"") > -1)) { scragglers.push(tagger.tagSentence(obj.value)); return false; } return true; }); taggedSentence = taggedSentence.concat(scragglers); // munge sentences to create tracery taggedSentence = taggedSentence.filter(tagifySentence); log(2)("tagged sentence:", taggedSentence); return tidySentences(taggedSentence); }; const tidySentences = (arr) => { if (!arr || arr.length == 0) { return ""; } // reduce array of objects to one object and return it's value return arr.reduce((a, b) => { // apply proper spacing, being mindful of punctuation if ( b.tag === "punctuation" || b.pos === "POS" ) { return { value : a.value + "" + b.value }; } else { return { value : a.value + " " + b.value }; } }).value; }; let was_a_or_an = false; const tagifySentence = (obj, i, arr) => { let key = obj.pos === "." ? "ending" : obj.pos; let word = obj.normal; log(3)("found: %s (%s):", obj.value, word); if (args.ignore.indexOf(obj.pos) !== -1) { return obj; // return ignored values unscathed } else if(obj.tag === "punctuation" || obj.pos === "POS") { // escape problematic punctuation, just in case obj.value = ["'", '"', '[', ']', '#'].indexOf(obj.value) > -1 ? "\\" + obj.value : obj.value; return obj; } if ( ! tracery[key] ) { tracery[key] = []; } if (obj.lemma && args.modifiers) { if ((obj.normal.slice(-2) === 'ed' && obj.lemma.slice(-2) !== 'ed')) { word = obj.lemma; } } if ( tracery[key].indexOf(word) === -1 && (word !== "a" || word !=="an")) { tracery[key].push(word); } obj.value = utils.chance(args.percent) ? obj.value : '#' + key + '#'; if (tracery[key].indexOf(word) > -1 ) { if(args.modifiers && obj.value.indexOf('#') >= -1) { if((i === 0 && obj.pos !== "\"") || (i >= 1 && arr[i-1].pos === "\"")) { //capitalize first letter in a sentence obj.value = '#' + key + '.capitalize#'; } else if (obj.pos.slice(0,3) === "NNP") { // capitalize all proper nouns obj.value = '#' + key + '.capitalizeAll#'; } if (obj.lemma && obj.normal.slice(-2) === 'ed' && obj.lemma.slice(-2) !== 'ed') { // preserve "-ed" words obj.value = '#' + key + '.ed#'; } if (was_a_or_an) { // ensure correct "A" or "An" is used obj.value = '#' + key + '.a#'; } } log(3)("adding %s to %s (%s)", obj.value, key, obj.pos); } // don't save one-letter non-words if (obj.value.length === 1 && (word !== "a" || word !=="o" || word !== "i")) { return false; } if (word === "a" || word === "an") { was_a_or_an = true; return false; } else { was_a_or_an = false; } return obj; }; const parseCorpus = (file) => { let corpus = file.toString(); // tidy newlines corpus = corpus.replace(/\r\n/g, '\n'); var splitting = ""; switch (args.split || "s") { case "l": // line corpus = corpus.split(/\n+/); splitting = "line"; break; case "p": // paragraph corpus = corpus.split(/\n\n+/); splitting = "paragraph"; break; default: // sentence corpus = util.string.sentences(corpus.replace(/\n+/, ' ').replace(/\s+/, ' ')); splitting = "sentence"; } log(1)("splitting on %s", splitting); // parse sentences, remove one-character sentences corpus = corpus.map(parseSentence).filter((s) => s.length > 1); tracery['sentences'] = corpus; utils.write(args.output, tracery); return tracery; }; if (args.input) { var file = utils.read(args.input); if (file.length) { if(type === "module") { return parseCorpus(file); } else { parseCorpus(file); } } else { log(0)(file.error); } } };