UNPKG

mmir-lib

Version:

MMIR (Mobile Multimodal Interaction and Relay) library

github.com/mmig/mmir-lib

692 lines (603 loc) • 22.2 kB

JavaScript

define(['mmirf/pegjs','mmirf/configurationManager','mmirf/grammarConverter','mmirf/baseGen','mmirf/util/deferred','mmirf/util/extend','mmirf/util/toArray','mmirf/logger', 'module'],
/**
 * Generator for executable language-grammars (i.e. converted JSON grammars).
 *
 * <p>
 * This generator uses PEG.js for compiling the JSON grammar.
 *
 * <p>
 * The generator for compiling the JSON grammar definitions in <code>www/config/languages/&lt;language code&gt;/grammar.json</code>
 * can be configured in <code>www/config/configuration.json</code>:<br>
 * <pre>
 * {
 *   ...
 *   "grammarCompiler": "pegjs",
 *   ...
 * }</pre>
 *
 * <p>
 * PEGjs supports grammar generation for:
 * PEG (Parsing Expression Grammar)
 *
 * NOTE: PEG is a different formalism than "classical" context-free grammar definitions/formalisms;
 *       see also <a href="http://en.wikipedia.org/wiki/Parsing_expression_grammar">explanation of PEG in Wikipedia</a>
 *
 * @see PEGjs homepage at <a href="http://pegjs.majda.cz/">http://pegjs.majda.cz/</a>
 *
 * @class
 * @constant
 * @public
 * @name PegJsGenerator
 * @memberOf mmir.env.grammar
 * @hideconstructor
 *
 * @requires PEG.js
 */
function(pegjs, configManager, GrammarConverter, BaseGenerator, deferred, extend, toArray, Logger, module){

/**
 * Deferred object that will be returned - for async-initialization:
 * the deferred object will be resolved, when this module has been initialized.
 *
 * NOTE: this intentionally re-uses the binding of the <code>deferred</code>
 *       factory parameter, i.e. after this statement <code>deferred</code>
 *       refers to the created instance, not to the factory function.
 *
 * @private
 * @type Deferred
 * @memberOf PegJsGenerator#
 */
var deferred = deferred();

//no async initialization necessary for PEG.js generator -> resolve immediately
deferred.resolve();

/**
 * The Logger for the PEGjs generator.
 *
 * @private
 * @type mmir.tools.Logger
 * @memberOf PegJsGenerator#
 *
 * @see mmir.Logging
 */
var logger = Logger.create(module);

/**
 * The default options for the PEGjs compiler.
 *
 * To overwrite the default options, configure the following property in <code>www/config/configuration.json</code>:<br>
 * <pre>
 * {
 *   ...
 *   "grammar": {
 *     ...
 *     "pegjs": {
 *       "cache": [true | false],           // "If true, makes the parser cache results, avoiding exponential parsing time in pathological cases but making the parser slower" - DEFAULT false
 *       "optimize": ["speed" | "size"],    //optimizing the generated parser for speed or (code) size - DEFAULT "speed"
 *       "output": ["source" | "parser"],   //should not be changed!!! whether to return TEXT or evaluated JavaScript - DEFAULT: "source"
 *       "allowedStartRules": RULE_NAMES    //should not be changed!!! - DEFAULT: not set
 *     }
 *     ...
 *   },
 *   ...
 * }</pre>
 *
 * non-specific compiler options:
 * <code>execMode = 'sync' | 'async'</code>
 * <code>genSourceUrl = true | STRING | FALSY</code>
 *
 * @constant
 * @private
 * @default cache := false, optimize := 'speed', output := 'source', allowedStartRules := undefined
 * @memberOf PegJsGenerator#
 */
var DEFAULT_OPTIONS = {
	cache: false,
	optimize: "speed",
	output: "source",
//	allowedStartRules: void(0), FIXME DISABLED: pegjs actually evaluates this value, if it present (even if it is undefined/FALSY)
	execMode: 'sync',//'sync' | 'async' | default: sync
	genSourceUrl: '',// true | STRING: the sourceURL for eval'ed parser-module | default: FALSY
};

/**
 * the ID for the grammar engine
 * @constant
 * @private
 * @memberOf PegJsGenerator#
 */
var engineId = 'pegjs';

/**
 * Name for this plugin/grammar-generator (e.g. used for looking up configuration values in configuration.json).
 * @constant
 * @private
 * @memberOf PegJsGenerator#
 */
var pluginName = 'grammar.'+engineId;

/**
 * instance of a BaseGenerator (provides common resources for generating grammar definitions).
 * @constant
 * @private
 * @type BaseGenerator
 * @memberOf PegJsGenerator#
 */
var baseGenerator = new BaseGenerator(logger, engineId);

/**
 * Exported (public) functions for the PEGjs grammar-engine.
 * @public
 * @type GrammarGenerator
 * @memberOf PegJsGenerator#
 */
var pegjsGen = {
	/** @scope PegJsGenerator.prototype */

	/**
	 * The name/ID for the compile engine for the PEG.js compiler
	 *
	 * @memberOf mmir.env.grammar.PegJsGenerator.prototype
	 */
	engineId: engineId,

	/**
	 * @param {Function} [callback] OPTIONAL
	 * 			the callback that is triggered, when the engine is initialized
	 * @returns {Deferred}
	 * 			a promise that is resolved, when the engine is initialized
	 * 			(NOTE: if you use the same function for the <code>callback</code> AND the promise,
	 * 			       then the function will be invoked twice!)
	 *
	 * @memberOf mmir.env.grammar.PegJsGenerator.prototype
	 */
	init: function(callback){
		if(callback){
			//the callback is registered for both outcomes (resolved & rejected)
			deferred.then(callback, callback);
		}
		return deferred;
	},

	/** @returns {Boolean} if this engine compilation works asynchronously. The current implementation works synchronously (returns FALSE) */
	isAsyncCompilation: function(){ return false; },

	/**
	 * The function for compiling a JSON grammar:
	 * converts the JSON grammar of the converter instance to a PEG.js grammar definition,
	 * compiles it to parser source code, wraps the parser source into an executable
	 * module (via the converter's code-wrap prefix/suffix) and evaluates that module.
	 *
	 * @param {mmir.grammar.GrammarConverter} theConverterInstance
	 * 			the converter instance (will be extended with the PEG.js specific conversion functions)
	 * @param {String} instanceId
	 * 			the ID for the compiled grammar (usually this is a language code)
	 * @param {Number|GrammarCompileOption} fileFormatVersionOrOptions
	 * 			the version of the file format (this is a constant within {@link mmir.SemanticInterpreter#getFileVersion},
	 * 			or a compile options object, see {@link mmir.env.grammar.BaseGenerator#toGrammarCompileOptions}
	 * @param {Function} [callback] OPTIONAL
	 * 			invoked with the converter instance, after the parser module was evaluated
	 * 			(also invoked, if evaluating the parser module failed)
	 * @returns {mmir.grammar.GrammarConverter}
	 * 			the grammar instance with the compiled function for executing the grammar
	 * 			attached to the instance's {@link GrammarConvert#executeGrammar} property/function.
	 */
	compileGrammar: function(theConverterInstance, instanceId, fileFormatVersionOrOptions, callback){

		//attach functions for PEG.js conversion/generation to the converter-instance:
		extend(theConverterInstance, baseGenerator, PegJsGrammarConverterExt);

		//start conversion: create grammar in PEG.js syntax (from the JSON definition):
		theConverterInstance.init();
		this._preparePrintError();
		theConverterInstance.convertJSONGrammar();
		var grammarDefinition = theConverterInstance.getGrammarDef();

		//load options from configuration:
		var config = configManager.get(pluginName, {});
		//combine with default options:
		var options = baseGenerator.toGrammarCompileOptions(instanceId, DEFAULT_OPTIONS, config, fileFormatVersionOrOptions);

		//NOTE: the local names in this function are visible to the direct eval() below,
		//      i.e. they must not be renamed without checking the generated wrapper code.
		var compileParserModule = function(grammarParser, hasError){

			var addGrammarParserExec = theConverterInstance.getCodeWrapPrefix(options.fileVersion, JSON.stringify(options.execMode), !options.strict)
				+ 'var parser = '
				+ grammarParser
				+ ';\n'
				+ 'function _printLog(){console.log.apply(console, arguments);};\n'
				+ 'function _noopFunc(){};\n'
				+ 'var _logDebug = _noopFunc;\n'
				+ 'var grammarFunc = function(inputStr, options){\n'
//				+ ' options = options || {debug: true, trace: function(msg){window.alert(msg)}};\n' //TEST
				+ ' _logDebug = options && options.debug? _printLog : _noopFunc;\n'
				+ ' var result; try {\n'
				+ ' result = parser.parse.call(this, inputStr, options);\n'
				+ ' } catch (err){\n'
				+ ' result = {error: err, phrase: inputStr, engine: "pegjs"};\n'//TODO warning/error messaging? -> need to handle encoded chars, if error message should be meaningful
				+ ' }\n'
				+ ' return result;\n'
				+ '};\n'
				+ theConverterInstance.getCodeWrapSuffix(theConverterInstance.getEncodedStopwords(), 'grammarFunc', instanceId);

			if(options.genSourceUrl){

				var sourceUrlStr;
				if(options.genSourceUrl === true){
					sourceUrlStr = 'gen/grammar/_compiled_grammar_'+instanceId;
				} else {
					sourceUrlStr = options.genSourceUrl.toString().replace(/<id>/g,instanceId);
				}

				//for Chrome / FireFox debugging: provide an URL for eval'ed code
				addGrammarParserExec += '//@ sourceURL='+sourceUrlStr+'\n'
					+'//# sourceURL='+sourceUrlStr+'\n';
			}

			theConverterInstance.setGrammarSource(addGrammarParserExec);

			try{

				//NOTE(review): evaluates generated code; the grammar definitions are
				//              assumed to be trusted project resources - confirm for your setup
				eval(addGrammarParserExec);

			} catch (err) {

				//TODO russa: generate meaningful error message with details about error location
				//				eg. use esprima (http://esprima.org) ...?
				//				... as optional dependency (see deferred initialization above?)

				var evalMsg = 'Error during eval() for "'+ instanceId +'": ' + err;

				if(pegjs.printError){
					pegjs.printError(evalMsg);
				} else {
					logger.error('PEGjs', 'evalCompiled', evalMsg, err);
				}

				//only set the dummy function, if the parser itself was compiled without errors
				//(otherwise the generated parser already reports the compile error)
				if(! hasError){
					evalMsg = '[INVALID GRAMMAR JavaScript CODE] ' + evalMsg;
					var parseDummyFunc = (function(msg, error){
						return function(){ console.error(msg); console.error(error); throw msg;};
					})(evalMsg, err);

					parseDummyFunc.hasErrors = true;

					theConverterInstance.setGrammarFunction(parseDummyFunc);
				}
			}

			//invoke callback if present:
			if(callback){
				callback(theConverterInstance);
			}
		};

		var isPreventDefault = this._afterCompileParser(compileParserModule, callback);
		var result = this._compileParser(grammarDefinition, options, isPreventDefault);

		if(!isPreventDefault){
			var hasError = result.hasError;
			compileParserModule(result.def, hasError);
		}

		return theConverterInstance;
	},

	/**
	 * Compiles the PEG.js grammar definition to parser source code.
	 *
	 * If compiling fails, the error is printed and the source code for a dummy parser is
	 * returned, whose <code>parse()</code> function prints & throws the error message.
	 *
	 * @param {String} grammarDefinition
	 * 			the grammar definition in PEG.js syntax
	 * @param {GrammarCompileOption} options
	 * 			the compile options (passed on to <code>pegjs.generate()</code>; <code>options.id</code> is used in error messages)
	 * @param {TRUTHY|VOID} afterCompileParserResult
	 * 			the result of {@link #_afterCompileParser} (not used by this implementation)
	 * @returns {PlainObject}
	 * 			<code>{def: STRING, hasError: BOOLEAN}</code> where <code>def</code> is the result of
	 * 			<code>pegjs.generate()</code>, or in case of errors, the source code of the dummy parser
	 *
	 * @protected
	 */
	_compileParser: function(grammarDefinition, options, afterCompileParserResult){

		var hasError = false;
		var grammarParser;
		try{
			grammarParser = pegjs.generate(grammarDefinition, options);
		} catch(error) {

//			"{
//			  "message": "Expected \"=\" or string but \"_\" found.",
//			  "expected": [
//			    {
//			      "type": "literal",
//			      "value": "=",
//			      "description": "\"=\""
//			    },
//			    {
//			      "type": "other",
//			      "description": "string"
//			    }
//			  ],
//			  "found": "_",
//			  "location": {
//			    "start": {
//			      "offset": 1233,
//			      "line": 26,
//			      "column": 5
//			    },
//			    "end": {
//			      "offset": 1234,
//			      "line": 26,
//			      "column": 6
//			    }
//			  },
//			  "name": "SyntaxError"
//			}"

			//guard against thrown null/undefined values, so that creating the
			//error message cannot fail itself
			var isSyntaxError = !!error && error.name === 'SyntaxError';

			var msg = ' while compiling grammar "' + options.id+ '": ';
			if(isSyntaxError){
				msg= 'SyntaxError' + msg + error.message;
			} else {
				msg = 'Error' + msg + (error && error.stack? error.stack : error);
			}

			//use the location information, if available (or the error itself as fallback)
			var loc = (error && error.location && (error.location.start || error.location.end)) || error || {};
			if(typeof loc.line !== 'undefined'){
				msg += ' at line '+loc.line;
			}

			if(typeof loc.column !== 'undefined'){
				msg += ':'+loc.column;
			}

			if(typeof loc.offset !== 'undefined'){
				msg += ' (offset '+loc.offset+')';
			}

			msg += '\n-----------------------------\n Grammar Definition:\n-----------------------------\n' + grammarDefinition;

			if(pegjs.printError){
				pegjs.printError(msg);
			} else {
				console.error(msg);
			}

			msg = '[INVALID GRAMMAR] ' + msg + (isSyntaxError && error.stack? error.stack : '');
			grammarParser = '{ parse: function(){ var msg = '+JSON.stringify(msg)+'; console.error(msg); throw msg;} }';

			hasError = true;
		}

		return {def: grammarParser, hasError: hasError};
	},

	/**
	 * Sets the default error-print function <code>pegjs.printError</code>, if none is set yet.
	 *
	 * @protected
	 */
	_preparePrintError: function(){

		//setup logger for compile errors, if not already set
		if(! pegjs.printError){
			/**
			 * The default logging / error-print function for PEGjs.
			 *
			 * @private
			 * @name printError
			 * @function
			 * @memberOf PegJsGenerator.pegjs#
			 *
			 * @see mmir.Logging
			 */
			pegjs.printError = function(){
				var args = toArray(arguments);
				//prepend "location-information" to logger-call:
				args.unshift('PEGjs', 'compile');
				//output log-message:
				logger.error.apply(logger, args);
			};
		}
	},

	/**
	 * The default logging / error-print function for PEGjs:
	 * forwards all arguments to <code>pegjs.printError</code>, if it is set,
	 * and otherwise to <code>console.error</code>.
	 *
	 * @protected
	 *
	 * @see mmir.Logging
	 */
	printError: function(){
		if(pegjs.printError){
			pegjs.printError.apply(pegjs, arguments);
		} else {
			//forward the arguments themselves (instead of logging the arguments-object as single value)
			console.error.apply(console, arguments);
		}
	},

	/**
	 * Optional hook for pre-processing the generated parser, after the parser is generated.
	 *
	 * By default, this function returns VOID, in which case the parser-module is created by default.
	 *
	 * If a TRUTHY value is returned instead, then the overriding implementation must invoke <code>compileParserModuleFunc</code>:
	 * <code>compileParserModuleFunc(compiledParser : STRING, hasErrors : BOOLEAN)</code>
	 *
	 *
	 * @param {Function} compileParserModuleFunc
	 * 				the function that generates the parser-module:
	 * 				<code>compileParserModuleFunc(compiledParser : STRING, hasErrors : BOOLEAN)</code>
	 *
	 * @param {Function} compileCallbackFunc
	 * 				the callback function which will be invoked by compileParserModuleFunc, after it has finished.
	 * 				If compileParserModuleFunc() is prevented from execution then the callback MUST be invoked manually
	 * 				<code>compileCallbackFunc(theConverterInstance: GrammarConverter)</code>
	 *
	 * @returns {TRUTHY|VOID}
	 * 				FALSY for the default behavior.
	 * 				IF a TRUTHY value is returned, then the default action after compiling the parser
	 * 				is not executed:
	 * 				i.e. compileParserModuleFunc is not automatically called and in consequence the callback is not invoked
	 *
	 *
	 * 				NOTE: if not FALSY, then either compileParserModuleFunc() must be invoked, or the callback() must be invoked!
	 *
	 * @protected
	 */
	_afterCompileParser: function(compileParserModuleFunc, compileCallbackFunc){
		//default: return VOID
		return;
	}
};


////////////////////////////////////// PEG.js specific extensions to GrammarConverter ////////////////////////////////

/**
 * PEGjs specific extension / implementation for {@link mmir.grammar.GrammarConverter} instances
 *
 * @type mmir.grammar.GrammarConverter
 * @memberOf PegJsGenerator#
 */
var PegJsGrammarConverterExt = {

	/**
	 * (Re-)initializes the fields that are used during the conversion to the PEG.js grammar definition.
	 *
	 * @memberOf PegJsGrammarConverterExt
	 */
	init: function(){

		this.THE_INTERNAL_GRAMMAR_CONVERTER_INSTANCE_NAME = "theGrammarConverterInstance";
		this._WHITESPACE_TOKEN_NAME = "WS";
		this._PARTIAL_MATCH_PREFIX = "_r";

		this.grammar_tokens = "/* --- Token definitions --- */\n\n/* Characters to be ignored */\n"
			+ this._WHITESPACE_TOKEN_NAME +" = ' '/'\\t';\n\n/* Non-associative tokens */\n";

		this.grammar_utterances = "";
		this.grammar_phrases = "phrases\n = ";
		this.token_variables = "{\n var " + this.variable_prefix + "result = '';\n";
		this.tokens_array = [];
	},

	/**
	 * Converts the JSON grammar definition to a PEG.js grammar definition:
	 * the result is stored in the field <code>grammar_definition</code>.
	 */
	convertJSONGrammar: function(){

		this.json_grammar_definition = this.maskJSON(this.json_grammar_definition);

		this.token_variables += " var semanticAnnotationResult = {};\n"
			//include some helper functions:
			+ this.helper_func_flatten
			+ this.helper_func_isarray
			+ this.helper_func_tok
			+ this.helper_func_offset
			+ this.helper_func_index
			+ this.helper_func_tokenList
			+ this.helper_func_getTok;

		this.parseTokens();
		this.parseUtterances();
		this.parseStopWords();

		this.grammar_definition = this.token_variables
			+ "}\n\n"
			+ "\n\n/* --- Grammar specification --- */\n\nutterance\n = phrases { "
			+ "_logDebug(" + this.variable_prefix + "result); "
			+ "semanticAnnotationResult.result = " + this.variable_prefix + "result; return "+ this.variable_prefix +"result;} ;\n\n"
			+ this.grammar_utterances
			+ "\n"
			+ this.grammar_phrases
			+ ";\n\n"
			+ this.grammar_tokens;

		this.json_grammar_definition = this.unmaskJSON(this.json_grammar_definition);
	},

	/**
	 * Creates the token definitions (appended to <code>grammar_tokens</code>) and the
	 * corresponding variable declarations (appended to <code>token_variables</code>)
	 * for the <code>tokens</code> of the JSON grammar definition.
	 */
	parseTokens: function(){
		var self = this;
		var json_tokens = this.json_grammar_definition.tokens;
		var pref = self.variable_prefix;

		for(var token_name in json_tokens){

			var words = json_tokens[token_name];
			var tokenVarName = token_name.toLowerCase();

			self.token_variables += " var " + pref + tokenVarName + " = [];\n";

			var sb = [token_name, "\n = _m:("];

			var isNotRegExpr = true;
			for(var i=0, size = words.length; i < size ; ++i){

				//NOTE RegExpr need to be recoded -> need to check, if current word is RegExp!
				//  example (see also _convertRegExpr()):
				//	INPUT:   '[a-zA-Z_]+'
				//	RECODED: [a-zA-Z_]+
				isNotRegExpr = this._checkIfNotRegExpr(words[i]);
				if( isNotRegExpr ){
					sb.push("'");
				}

				//add TOKEN string:
				sb.push( isNotRegExpr? this._prepareToken(words[i]) : this._convertRegExpr(words[i]));

				if( isNotRegExpr ){
					sb.push("'");
				}

				//if there is another word following, add OR operator
				if(i < size-1){
					sb.push("/");
				}
			}

			//close assignment for "= match:(" and create JavaScript processing for token
			sb.push(
				") { var res = _tok(" + pref + tokenVarName + ", _m); return {"
					+ this.entry_index_field + ": _offset(location()),"
					+ this.entry_type_field + ": '" + tokenVarName + "',"
					+ this.entry_token_field + ": res"
				+"}; };\n"
			);

			self.grammar_tokens += sb.join("");
		}
	},

	//////////////// implementing/overriding BaseGenerator fields & functions: ////////////////////////

	//impl. abstract
	phrase_separator: "/",

	//impl. abstract
	phrase_match_var: "_m",

	//impl. abstract
	toUtteranceDeclarationHead: function(utteranceName){
		return utteranceName + '\n = ';
	},

	//impl. abstract
	toUtteranceDeclarationPhrase: function(_phrase, semanticInterpretation){
		// /*phrase +*/ semantic_interpretation
		return semanticInterpretation;
	},

	//impl. abstract
	addPhraseMatchForInterpretion: function(i, phraseList, phraseBuffer){
		//create STRING for phrase-matching:
		// phrases are separated by the whitespace token & are labeled with the 1-based partial-match name
		if(i > 0){
			phraseBuffer.push(" " + this._WHITESPACE_TOKEN_NAME + " ");
		}
		phraseBuffer.push(this._PARTIAL_MATCH_PREFIX + (i+1) + ":" + phraseList[i]);
	},

	//impl. abstract
	addPartialPhraseInterpretion: function(i, tempPhrasesVar, _phraseList, semanticProcBuffer){
		//create STR for semantic processing of phrase
		semanticProcBuffer.push(
			this.temp_phrase_match_var + " = " + this._PARTIAL_MATCH_PREFIX + (i+1) + ";"
				+ tempPhrasesVar + "['phrases'].push(" + this.temp_phrase_match_var + ");\n\t\t"
		);
	},

	//impl. abstract
	toPhraseMatchResultForInterpretion: function(pharseMatchResultDef){
		return "var " + this.phrase_match_var + " = {" + pharseMatchResultDef + "}";
	},

	//impl. abstract
	toPhraseInterpretion: function(phraseMatchStr, pharseMatchResult, semanticProcResult){
		return phraseMatchStr + " {\n\t " + pharseMatchResult + "; " + semanticProcResult + "; return _m; \n\t} ";
	},

	//additional (custom) helper function:

	helper_func_offset: " var _offset = function(pos, str){return pos.start.offset;};\n",

	//////////////// internal helpers: ////////////////////////

	/**
	 * Prepares a (non reg-expr) token for usage as string literal in the grammar definition.
	 *
	 * @param {String} token
	 * @returns {String} the token with masked single quotes
	 */
	_prepareToken: function(token){
		//need to mask delimiting quotes, i.e. '
		return token.replace(/'/g, "\\'");
	},

	/**
	 * Heuristic check, if a token is a plain string, i.e. NOT a regular expression.
	 *
	 * @param {String} token
	 * @returns {Boolean} <code>false</code> if the token looks like a regular expression, otherwise <code>true</code>
	 */
	_checkIfNotRegExpr: function(token){

		//NOTE(review): in the following 2 expressions the alternation splits the whole pattern,
		//              i.e. an unescaped opening bracket that is not at the start matches even
		//              without a closing bracket - left unchanged; confirm whether this is intended

		//test for character-group
		if( ! /([^\\]\[)|(^\[).*?[^\\]\]/.test(token)){

			//test for grouping
			if( ! /([^\\]\()|(^\().*?[^\\]\)/.test(token) ){

				//try for single-characters that occur in reg-expr FIXME this may produce false-positives!!!
				return ! /[\?|\*|\+|\^|\|\\]/.test(token); //excluded since these may be more common in natural text: . $
			}
		}

		return false;
	},

	/**
	 * Re-codes a regular expression token to PEG.js syntax:
	 * sequences of "normal" characters are converted to quoted strings,
	 * (unescaped) reg-expr symbols are inserted as operators, where
	 * <code>|</code> is converted to the PEG choice operator <code>/</code>.
	 *
	 * @param {String} token
	 * 			the token with the regular expression
	 * @returns {String} the re-coded expression
	 *
	 * @example
	 * "cars?" -> " 'car' 's' ?"
	 */
	_convertRegExpr: function(token){

		var sb = [], ch, last = null, isString = false, isGroup = false, isEsc = false, hasOr = false;
		for(var i=0, size = token.length; i < size; ++i){
			ch = token.charAt(i);
			switch(ch){
			case '(':
			case ')':
			case '[':
			case ']':
			case '+':
			case '*':
			case '?':
			case '$':
			case '^':
			case '.':
			case '|':
				if(last !== '\\'){

					//if changed from STRING -> non-STRING, then "close" string first:
					if(isString){

						//for "optional" expression: modify previous entry to be a single character-sequence
						// ...cars'? -> ...car' 's'?
						if(ch === '?' && sb.length > 0){//TODO also for '+', '*', ...???
							sb[ sb.length - 1 ] = '\' \'' + sb[ sb.length - 1 ];
						}

						sb.push("' ");
						isString = false;
					}

					//insert reg-expr symbol
					if(ch !== '|'){
						sb.push(ch);
					} else {
						sb.push(' / ');
						hasOr = true;
					}

					//is character-group opening/closing?
					if(isGroup && ch === ']'){
						isGroup = false;
					} else if(!isGroup && ch === '['){
						isGroup = true;
					}

					break;

				} else {
					isEsc = true;
				}
				//NOTE: intentional fall-through -> an escaped reg-expr symbol is handled as "normal" character
			default:

				if(isEsc){
					sb.pop();//remove last element, i.e. the escape-character
					isEsc = false;
				}

				//if changed from non-STRING -> STRING, then "open" string now:
				if(!isGroup && !isString){
					sb.push(" '");
					isString = true;
				}
				sb.push(ch);
			}

			last = ch;
		}

		//if last char was a STRING, "close" string now:
		if(isString){
			sb.push("'");
		}

		//wrap expressions with choices into a group
		if(hasOr){
			sb.unshift('(');
			sb.push(')');
		}

		return sb.join('');
	}
};

return pegjsGen;

});