prose-js
Version:
Natural language-to-pseudocode compiler.
219 lines (193 loc) • 5.95 kB
JavaScript
/**
* Text-to-pseudocode translator module.
* @exports Translator
*/
var StringUtils = require('./string-utils');
function Translator() {
// Observation: Commas are taken as "less strict" end-of-statement characters,
// since commas may also be used when enumerating list elements.
// Because of this, statement separation is done twice,
// the first time taking in consideration only "stricter" characters like ";" and ".",
// and the second time taking in consideration also commas (when not escaped).
var stringUtils = new StringUtils();
var tagger = stringUtils.getTagger();
var STATEMENT_SEPARATOR = '\n';
var TAB = '\t';
var STRICT_END_OF_STATEMENT = {
text: [/\s*[\.;]+\s*/ig],
code: STATEMENT_SEPARATOR
};
var END_OF_STATEMENT = {
// text: [/\s*[\.,;]+\s*/g],
text: [/(?:[^\\])(\s*(?:, +y|[\.,;])+\s*)/i],
code: STATEMENT_SEPARATOR
};
var BLOCK = {
text: [
// /(?:mientras|por|para|si|sino)(?:[^,]+)\s*,\s*(.+)\s*(?=[,\.])/g,
/:\s*(?!(?:\n|$))(.+)\s*\s*(?=(?:\n|$))/ig
],
code:
'{'
+ STATEMENT_SEPARATOR
+ TAB + '$1'
+ STATEMENT_SEPARATOR
+'}'
};
var IF_ELSE_BLOCK = {
text: [
/((?:si)(?:[^,:]+))\s*(?:,|:)\s*([^\.]+)(?:[\.,])\s*(sino)(?:[:,]*)([^\.]+)(?=\.|$)/ig
],
code:
'$1 {'
+ STATEMENT_SEPARATOR
+ TAB + '$2'
+ STATEMENT_SEPARATOR
+ '} $3 {'
+ STATEMENT_SEPARATOR
+ TAB + '$4'
+ STATEMENT_SEPARATOR
+ '}'
};
var CONDITIONAL_BLOCK = {
text: [
/((?:mientras|por|para|si|sino)(?:[^,:]+))\s*(?:,|:)\s*(.+)\s*(?=[,\.])/ig
],
code:
'$1 {'
+ STATEMENT_SEPARATOR
+ TAB + '$2'
+ STATEMENT_SEPARATOR
+ '}'
};
var JOINED_BLOCKS = {
text: [
/\s*}\s*{\s*/ig
],
code: STATEMENT_SEPARATOR
};
var ESCAPED_CHARACTERS = {
text: [
/(?:\\)([\.,;])/ig
],
code: '$1'
};
var END_OF_STATEMENT_PATTERN = /(?:[^\\])(\s*[\.,;]+\s*)/i;
var FOR_EACH_PATTERN = /(?!(?:\s+|}|$))(por|para) +cada +(.+) +en +(.+)(?=(?:\s+|{|^))/i;
var LIST_PATTERN = /(?!(?:\s+|}|$))([\w ]+) +contiene +(?:a +)?([^\.\n]+)(?=\.|$)/i;
var WORD_REGEX = /^[A-Za-z\u00E0-\u00FC]+$/;
/*
* Returns the text translated to pseudocode.
*
* @param {string} text - Text to translate.
*/
this.translate = function(text) {
text = replace(text, BLOCK);
text = replace(text, IF_ELSE_BLOCK);
text = replace(text, CONDITIONAL_BLOCK);
text = replace(text, JOINED_BLOCKS);
text = replace(text, STRICT_END_OF_STATEMENT);
text = translateStatements(text);
text = replace(text, END_OF_STATEMENT, true);
text = replace(text, ESCAPED_CHARACTERS);
text = replaceFunctions(text);
return text;
}
function replace(string, replaceParams, isGrouped) {
for (var i = 0; i < replaceParams.text.length; i++) {
while (replaceParams.text[i].test(string)) {
if (isGrouped) {
var matches = null;
while (!matches) {
matches = replaceParams.text[i].exec(string); // Special thanks to the masterminds behind JS regular expressions
}
string = string.replace(new RegExp(escapeRegExp(matches[1]), 'g'), replaceParams.code);
} else {
string = string.replace(replaceParams.text[i], replaceParams.code);
}
}
}
return string;
}
function translateStatements(block) {
var translated = '';
var statements = block.split(STATEMENT_SEPARATOR);
for (var i = 0; i < statements.length; i++) {
if (i > 0) {
translated += STATEMENT_SEPARATOR;
}
translated += translateStatement(statements[i]);
}
return translated;
}
function translateStatement(statement) {
if (FOR_EACH_PATTERN.test(statement)) {
var matches = FOR_EACH_PATTERN.exec(statement);
statement = statement
.replace(new RegExp(escapeRegExp(matches[1]), 'g'), stringUtils.toCamelCase(matches[1]))
.replace(new RegExp(escapeRegExp(matches[2]), 'g'), stringUtils.toCamelCase(matches[2]));
}
if (LIST_PATTERN.test(statement)) {
var matches = LIST_PATTERN.exec(statement);
var listName = stringUtils.toCamelCase(matches[1]);
var listElements = matches[2]
.replace(/ +[ye] +/g, ', ') // Replace ' y ' with a separation comma.
.replace(/,/g, '\\,'); // Escape commas to prevent them from being detected as END OF STATEMENT.
statement = listName + ' = [' + listElements + ']';
}
// while (END_OF_STATEMENT_PATTERN.test(statement)) {
// var matches = END_OF_STATEMENT_PATTERN.exec(statement);
// statement = statement.replace(new RegExp(matches[1], 'g'), STATEMENT_SEPARATOR);
// }
return statement;
}
function escapeRegExp(str) {
return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
function replaceFunctions(string) {
// Any verbs other than TO BE are considered function calls
if (typeof string === 'string') {
var strings = stringUtils.split(string);
} else {
var strings = string;
}
var tags = tagger.tag(strings);
for (var i = 0; i < strings.length; i++) {
if (
tags[i] === 'VERB'
&& strings[i].trim() !== ''
&& strings[i] !== 'es'
&& WORD_REGEX.test(strings[i])
) {
// Once a call is found, sorround parameters with parenthesis.
var fun = strings[i];
var params = [];
for (var j = i + 1; j < strings.length; j++) {
if (
tags[j] === '.'
|| tags[j] === 'CONJ'
|| strings[j].trim() === ''
) {
break;
}
params.push(strings[j]);
// If more than one parameter and separated by a conjunction,
// replace conjunction with a comma.
// if (tags[j] === 'CONJ') {
// params.push(',');
// } else {
// params.push(strings[j]);
// }
}
var finalStrings = strings.slice(0, i);
finalStrings.push(fun.toLowerCase());
finalStrings.push('(');
finalStrings = finalStrings.concat(params);
finalStrings.push(')');
return stringUtils.join(finalStrings) + ' ' + replaceFunctions(strings.slice(j));
}
}
return stringUtils.join(strings);
}
}
module.exports = Translator;