UNPKG

mmir-lib

Version:

MMIR (Mobile Multimodal Interaction and Relay) library

github.com/mmig/mmir-lib

1,347 lines (1,166 loc) • 44.5 kB

JavaScript

/** * The GrammarConverter object initializes the grammar for processing * <em>natural language text</em>, e.g. from the voice recognition. * * @class * @name GrammarConverter * @memberOf mmir.grammar * * @requires util/loadFile * @requires util/isArray * @requires positionUtils * * @example * var GrammarConverter = new mmir.require('mmirf/grammarConverter'); * var gc = new GrammarConverter(); */ define(['mmirf/util/isArray', 'mmirf/util/loadFile', 'mmirf/positionUtils'], function(isArray, loadFile, posUtil){ /** * @ignore * * @constructs mmir.grammar.GrammarConverter */ function GrammarConverter(){ //regular expression for detecting encoded chars (see mask/unmask functions) this.enc_regexp_str = "~~([0-9|A-F|a-f]{4})~~"; this.grammar_definition = ""; this.js_grammar_definition = ""; this.json_grammar_definition = null; this.stop_words_regexp; //default setting for masking value Strings in JSON values (see maskJSON() / unmaskJSON) this.maskValues = true; //default setting for masking property-name Strings in JSON values (see maskJSON() / unmaskJSON) // WARNING: this is actually EXPERIMENTAL; it should be set to false, since JS/CC may not be able to handle masked ID names... this.maskNames = false; //if execution of the grammar is asynchronously done (i.e. 
result is delivered using a callback) this.is_async = false; //list of processing steps: // { // name: 'processing step ID', // pre: function(input, pos){...}, //OPTIONAL function for pre-processing // post: function(result, pos){...}} //OPTIONAL function for post-processing // } this.procList = []; this.pos_order_field = '_order'; this.initDefaultProc(); }; GrammarConverter.prototype.loadGrammar = function(successCallback, errorCallback, grammarUrl, doLoadSynchronously){ var self = this; var success = function(data, _status, xhr){ self.json_grammar_definition = data; if (typeof successCallback == "function") { successCallback.call(this, self, xhr); } }; var error = function(_xhr, _status, data){ if (typeof errorCallback == "function") { errorCallback.call(this, self); } else { console.error("failed to load the grammar! error: "+ JSON.stringify(data)); } }; this.loadResource(success, error, grammarUrl, doLoadSynchronously); }; GrammarConverter.prototype.loadResource = function(successCallback, errorCallback, resourceUrl, doLoadSynchronously){ var theUrl = resourceUrl; if(!theUrl){ console.error('GrammarConverter.loadResource: missing URL!'); if(errorCallback){ errorCallback.call(this, this); } return;///////////////// EARLY EXIT ////////////////////// } var isLoadAsync = false; if(typeof doLoadSynchronously !== 'undefined' && doLoadSynchronously === false){ isLoadAsync = true; } loadFile({ async: isLoadAsync, dataType: 'json', url:theUrl, success: successCallback, error: errorCallback }); }; GrammarConverter.prototype.setStopWords = function(stopWordArray){ if(!this.json_grammar_definition){ this.json_grammar_definition = {}; } this.json_grammar_definition.stopwords = this.maskJSON(stopWordArray); this.parseStopWords(); //use unmask-function in order to ensure masking/unmasking is reversible // (or in case it is not: the error will be held in property stop_word) this.json_grammar_definition.stopwords = this.unmaskJSON(this.json_grammar_definition.stopwords); }; 
GrammarConverter.prototype.getStopWords = function(){ var jsonGrammar = this.json_grammar_definition; if(!jsonGrammar){ return null; } var stopwords = jsonGrammar.stopwords; if(!stopwords && (stopwords = jsonGrammar.stop_word)){ console.warn('GrammarConverter.getStopWords: using deprecated field stop_word for stopword-list, should use field stopwords instead!'); } return stopwords; }; /** * HELPER creates a copy of the stopword list and encodes all non-ASCII chars to their unicode * representation (e.g. for save storage of stringified stopword list, even if file-encoding * does not support non-ASCII letters). * * @returns {Array<String>} a copy of the stopword list, from the current JSON grammar * (or empty list, if no grammar is present) */ GrammarConverter.prototype.getEncodedStopwords = function(){ var list = this.getStopWords(); if(!list){ return []; } //use copy, since recoding works in-place (we do not want to modify the stored stopword list here) list = list.slice(0, list.length); //store stopwords with their Unicode representation (only for non-ASCII chars) return this.recodeJSON( list, this.maskAsUnicode ); }; //this is the original / main implementation for creating the RegExp for stopword removal GrammarConverter.prototype.parseStopWords = function(){ //create RegExp for stop words: var json_stop_words = this.getStopWords(); var size = json_stop_words.length; var stop_words = ""; //FIX for encoded chars: if a word begins or ends with an encoded char, \b cannot detect the word's boundaries // -> FIX if we encounter such words, create a separate RegExpr that uses // whitespaces & START-/END-expression for detecting word-boundaries, i.e. something like: (\s|^)(~~ ... words ... ~~)(\s|$) // // NOTE: the word-boundaries expression \b seems to have no effect in case of non-ASCII chars in general // (e.g. for Japanese characters / words) // .... so we would need to use this alternative mechanism (e.g. using whitespaces & START-/END-expr.) 
// even if these characters were not encoded! var encStartTester = new RegExp("^" + this.enc_regexp_str ,"gm"); var encEndTester = new RegExp( this.enc_regexp_str + "$","gm"); var enc_stop_words = ""; var isEncWord = function(str){ return encStartTester.test(str) || encEndTester.test(str); }; if(size > 0){ //... then the RegExp matches each stopword: for(var index=0; index < size ; ++index){ var stop_word = json_stop_words[index]; //special treatment for word that begin/end with encoded chars: if(isEncWord(stop_word)){ if(enc_stop_words.length === 0){ enc_stop_words = "(\\s|^)("; } else { enc_stop_words += "|"; } enc_stop_words += stop_word; continue; } //... for "normal" stopwords: if (stop_words.length > 0){ stop_words += "|"; //... if there is already a previous stopword-entry: do add OR-matching ... } stop_words += stop_word; //... add the stopword "stop_word" } } if(stop_words.length > 0){ stop_words = "\\b(" //starting at a word-boundary (-> ignore within-word matches) + stop_words + ")" + "\\b" //... ending with a word-boundary -> avoid "cutting out" matching partial strings // e.g. without \b: '(in)\s?' would match (and cut out all matches) within "winning" -> "wng" + "\\s?"; //... and optionally: one white-character that follows the stopword } else { //for empty stopword definition: match empty string // (basically: remove nothing) stop_words += '^$'; } this.stop_words_regexp = new RegExp(stop_words,"igm"); //RegExp options: // ignore-case (i), // match globally i.e. all occurrences in the String (g), // do not stop at line breaks (m) //only create ReExp for special stopwords, if we actually have at least 1 of those: //NOTE for replacement, we need to use a space-char (i.e. 
replace these with spaces, not empty strings: str.replace(..., ' '); ) if(enc_stop_words.length > 0){ enc_stop_words += ")(\\s|$)"; this.stop_words_regexp_enc = new RegExp(enc_stop_words,"igm"); } }; GrammarConverter.prototype.getStopWordsRegExpr = function(){ if(!this.stop_words_regexp){ this.parseStopWords(); } return this.stop_words_regexp; }; /** * FIX for stopwords that start or end with encoded chars (i.e. non-ASCII chars) * * This RegExp may be NULL/undefined, if no stopwords exist, that begin/end with encoded chars * i.e. you need to check for NULL, before trying to use this RegExpr. * * Usage: * @example * * //remove normal stopwords: * var removedStopwordsStr = someStr.replace( gc.getStopWordsRegExpr(), ''); * * * var removedStopwordsStr2 = removedStopwordsStr; * if(gc.getStopWordsEncRegExpr()){ * //NOTE replace stopwords with spaces (not with empty String as above, ie. with "normal" stopwords) * removedStopwordsStr2 = gc.getStopWordsEncRegExpr().replace( gc.getStopWordsEncRegExpr(), ' '); * } */ GrammarConverter.prototype.getStopWordsEncRegExpr = function(){ if(!this.stop_words_regexp){ this.parseStopWords(); } return this.stop_words_regexp_enc; }; /** * Get grammar definition text. * * This is the "source code" input for the grammar compiler * (i.e. syntax for jison, PEG.js or JS/CC). * * The grammar definition text is generated from the JSON grammar. * * @returns {String} the grammar definition in compiler-specific syntax */ GrammarConverter.prototype.getGrammarDef = function(){ return this.grammar_definition; }; /** * Sets the grammar definition text. * * This function should only be used during compilation of the JSON grammar * to the executable grammar. * * NOTE: Setting this "manually" will have no effect on the executable grammar. 
*
 * @see #getGrammarDef
 * @protected
 *
 * @param {String} rawGrammarSyntax
 * 			the grammar definition in compiler-specific syntax
 */
GrammarConverter.prototype.setGrammarDef = function(rawGrammarSyntax){
	this.grammar_definition = rawGrammarSyntax;
};

/**
 * Get the compiled JavaScript grammar source code.
 *
 * This is the output of the grammar compiler (with additional
 * JavaScript "framing" in {@link mmir.SemanticInterpreter#createGrammar}).
 *
 * This needs to be eval'ed before it can be executed (eval() will add
 * the corresponding executable grammar to SemanticInterpreter).
 *
 * @returns {String} the compiled, JavaScript grammar source code
 */
GrammarConverter.prototype.getGrammarSource = function(){
	return this.js_grammar_definition;
};

/**
 * Set the compiled JavaScript grammar source code
 * (i.e. the value that is returned by getGrammarSource()).
 *
 * @param {String} src_code
 * 			the compiled, JavaScript grammar source code
 *
 * @see #getGrammarSource
 */
GrammarConverter.prototype.setGrammarSource = function(src_code){
	this.js_grammar_definition = src_code;
};

/**
 * Set the executable grammar function.
 *
 * The grammar function takes a String argument: the text that should be parsed.
 *                            a Function argument: the callback for the result.
 * where the callback itself takes 1 argument for the result: <code>callback(result)</code>
 *
 * The returned result depends on the JSON definition of the grammar:
 * <code>func(inputText, resultCallback)</code>
 *
 *
 * @param {Function} func
 * 			the executable grammar function: <code>func(string, object, function(object)) : object</code>
 * @param {Boolean} [isAsync] OPTIONAL
 * 			set to TRUE, if execution is asynchronously done.
* 			DEFAULT: FALSE
 *
 * @see #executeGrammar
 */
GrammarConverter.prototype.setGrammarFunction = function(func, isAsync){
	//normalize to boolean (omitted argument results in FALSE)
	this.is_async = !!isAsync;
	//replaces the stub (or previous) implementation of executeGrammar on this instance
	this.executeGrammar = func;
};

/**
 * Check, if the grammar is executed asynchronously
 * (as specified via setGrammarFunction(); by default FALSE).
 *
 * @returns {Boolean} TRUE, if the result of the grammar execution is delivered via callback
 */
GrammarConverter.prototype.isAsyncExec = function(){
	return this.is_async;
};

/**
 * Removes the stopwords from a string and trims the result.
 *
 * Stopwords that begin/end with encoded chars are replaced by a space-char (if the
 * corresponding RegExp exists), then the "normal" stopwords are replaced by the empty string.
 *
 * @param {String} thePhrase
 * 			the string from which to remove stopwords (and trim()'ed)
 * @param {Boolean} [computePositions] OPTIONAL
 * 			DEFAULT: false
 *
 * @returns {String|{text: String, pos: ARRAY<Position>}}
 * 			the string where stopwords were removed, or if <code>computePositions</code> was <code>true</code>
 * 			a result object where the positions at which stopwords were removed will be available as an array:
 * 			<pre>
 * 			{
 * 				text: STRING,     // the string with removed stopwords
 * 				pos: [POSITION]   // array of positions for removed stopwords (and trimmed whitespaces),
 * 				                  // sorted by index: {i: NUMBER, len: NUMBER, mlen: NUMBER}
 * 			}
 * 			</pre>
 * 			where POSITION is an object with
 * 			<pre>
 * 			{
 * 				i: NUMBER,        // the index of the removed sub-string, re-calculated with regard to the input string
 * 				len: NUMBER,      // the length after the modification (i.e. of the replacement string: 1 or 0)
 * 				mlen: NUMBER      // the length before the modification (i.e. of the matched/removed sub-string)
 * 			}
 * 			</pre>
 */
GrammarConverter.prototype.removeStopwords = function(thePhrase, computePositions){

	var stop_words_regexp = this.getStopWordsRegExpr();

	var str = thePhrase;
	var positions = computePositions? [] : void(0);

	var replStr,//<- replacement string used in removeFunc
		appendPos,//<- controls if position-info should append or prepended to position-list
		replOffset,//<- global offset (i.e. offset with regard to input string thePhrase)
		iCalc,//<- helper index for calculating offset in modified strings
		calcPos,//<- helper function for calculating offset in modified strings
		replPositions,//<- helper/temporary positions-array for calculating offset in modified strings
		removeFunc;//<- replacement-function that also tracks the positions that were modified (via argument positions)

	if(computePositions){

		//initialize helpers for tracking positions
		replOffset = 0;
		iCalc = 0;
		appendPos = true;

		//HELPER for matched stopwords: log its position and remove it
		//       (used as replacement-function in String.replace(), i.e. arguments are: match, [groups...], offset, string)
		removeFunc = function(){

			var argLen = arguments.length;
			var match = arguments[0];
			//the offset of the match is the second to last argument (the number of group-arguments varies with the RegExp)
			var offset = arguments[argLen-2];

			//convert offset within the current (already modified) string to index w.r.t. the input string
			var index = calcPos(offset);

			if(appendPos){
				positions.push({i: index, mlen: match.length, len: replStr.length});
			} else {
				positions.unshift({i: index, mlen: match.length, len: replStr.length});
			}

			return replStr;
		};

		//HELPER adds the length-differences of all previous modifications that lie before the offset
		//       NOTE the helper is stateful: it continues at iCalc / replOffset from the previous invocation
		//            (i.e. relies on being invoked with ascending offsets until the helper variables are reset)
		calcPos = function(offset){

			if(!replPositions){
				//no previous modifications: offset is already correct
				return offset;
			}

			var pos;
			for(var size = replPositions.length; iCalc < size; ++iCalc){
				pos = replPositions[iCalc];
				if(pos.i > offset + replOffset){
					break;
				}
				replOffset += pos.mlen - pos.len;
			}
			return offset + replOffset;
		};
	}

	//1. remove stopwords that begin/end with encoded chars (replaced by a space-char):
	var encoded_stop_words_regexp = this.getStopWordsEncRegExpr();
	replStr = ' ';
	if(encoded_stop_words_regexp){

		str = str.replace(this.stop_words_regexp_enc, computePositions? removeFunc : replStr);

		if(computePositions){
			//update helper variables for calculating global offset (after string was modified):
			replOffset = 0;
			iCalc = 0;
			replPositions = positions.slice(0);
		}
	}

	//2. remove "normal" stopwords (replaced by empty string):
	replStr = '';
	str = str.replace(stop_words_regexp, computePositions? removeFunc : replStr);

	if(computePositions){
		positions.sort(function(a,b){return a.i - b.i;});//<- positions may not be ordered, if encoded_stop_words_regexp was applied
		//update helper variables for calculating global offset (after string was modified):
		replOffset = 0;
		iCalc = 0;
		replPositions = positions.slice(0);
	}

	//3. trim the result:
	if(computePositions){

		//trim with tracking of positions
		replStr = '';
		str = str.replace(/\s+$/, removeFunc);//<- trim at end
		positions.sort(function(a,b){return a.i - b.i;});//<- positions may not be ordered, if words were removed from the end of the string
		//update helper variables for calculating global offset (after string was modified):
		replOffset = 0;
		iCalc = 0;
		replPositions = positions.slice(0);

		appendPos = false;//<- prepending "start-trimming"-position may not be accurate, but should be "nearly" correct (w.r.t. to ordering by index pos.i)
		str = str.replace(/^\s+/, removeFunc);//<- trim at beginning
		positions.sort(function(a,b){return a.i - b.i;});//<- positions may not be ordered, if words were removed from the beginning of the string

	} else {
		str = str.trim();
	}

	return computePositions? {text: str, pos: positions} : str;
};

/**
 * Apply pre-processing to the string, before applying the grammar:
 *  * escape (i.e. "mask") non-ASCII characters
 *  * remove stopwords
 *
 * {@link #addProc} can be used to add additional pre-/post-processing steps
 *
 * @param {String} thePhrase
 * @param {PlainObject} [pos] OPTIONAL
 * 			in/out argument: if given, the pre-processor will add fields with information
 * 			on how the input string <code>thePhrase</code> was modified
 * 			By default the position information for escaped characters and removed stopwords will be added to
 * 			<code>pos.escape</code> (see {@link #maskString} for more details)
 * 			<code>pos.stopwords</code> (see {@link #removeStopwords} for more details)
 * 			And the field <code>pos._order</code> will contain the ordered list of pre-processing steps that were applied
 * 			i.e. the entries correspond to the field names, e.g. by default the list would contain <code>['escape', 'stopwords']</code>
 * @param {Array<ProcessingStep>} [processingSteps] OPTIONAL
 * 			if given, use <code>processingSteps</code> instead of (field) <code>procList</code>
 * 			NOTE positional argument (i.e. must specify <code>pos</code> too)
 *
 *
 * @returns {String} the pre-processed string
 *
 * @see #addProc
 * @see #removeProc
 * @see #getProcIndex
 * @see #procList
 */
GrammarConverter.prototype.preproc = function(thePhrase, pos, processingSteps){

	var proc, res = thePhrase, list = processingSteps || this.procList;

	//apply the pre-processing functions in order of the list:
	// the result of a step is the input for the next step
	for(var i=0, size=list.length; i < size; ++i){
		proc = list[i];
		if(proc.pre){
			res = proc.pre.call(this, res, pos);
			//if step returned a result object (instead of a string): store its position information
			// in field with the step's name & log the step's name in the order-list
			if(pos && typeof res === 'object'){
				if(pos[this.pos_order_field]) pos[this.pos_order_field].push(proc.name);
				else pos[this.pos_order_field] = [proc.name];
				pos[proc.name] = res.pos;
			}
		}
	}

	//if last step returned a result object: extract the processed string
	if(typeof res === 'object'){
		if(pos){
			posUtil.recalcProcPos(pos);
		}
		return res.text;
	}
	return res;
};

/**
 * Post-processes the result from the applied grammar:
 *  * un-masks non-ASCII characters
 *
 * {@link #addProc} can be used to add additional pre-/post-processing steps
 *
 * @param {SemanticResult} procResult
 * @param {Positions} pos
 * 			the position information (i.e.
modifications) of the pre-processing steps * @param {Array<ProcessingStep>} [processingSteps] OPTIONAL * if given, use <code>processingSteps</code> instead of (field) <code>procList</code> * NOTE positional argument (i.e. must specify <code>pos</code> too) * * @see #addProc * @see #removeProc * @see #getProcIndex * @see #procList */ GrammarConverter.prototype.postproc = function(procResult, pos, processingSteps){ var proc, res = procResult, list = processingSteps || this.procList; for(var i=list.length - 1; i >= 0; --i){ proc = list[i]; if(proc.post){ res = proc.post.call(this, res, pos); } } return res; }; /** * add pre-/post-processing step for running before/after {@link #executeGrammar} * * @param {ProcessingStep} proc the processing step: * <pre> * { * //the name of the processing step * name: string, * //OPTIONAL pre-processing function: pre(input: string | Positions, isCalcPos: boolean) * pre: Function, * //OPTIONAL post-processing function: post(result: any, pos: Positions) * post: Function * } * </pre> * @param {Boolean|Number} [isPrepend] OPTIONAL * if omitted (or FALSY): appended <code>proc</code> to processing steps * if number: insert <code>proc</code> at this index into the processing steps-list * if TRUE: prepend <code>proc</code> to processing steps * * @see #removeProc * @see #getProcIndex * @see #procList * @see mmir.grammar.stemmer * @example * //poitionUtils: * var posUtil = mmir.require('mmirf/positionUtils'); * //stemming function * var stemFunc = ...; * //add stemming function for pre-processing as first step * grammarConverter.addProc({ * name: 'stem', * pre: posUtil.createWordPosPreProc(stem, this) * }, true); */ GrammarConverter.prototype.addProc = function(proc, isPrepend){ if(proc.name === this.pos_order_field){ throw new Error('processing step must not be named "'+this.pos_order_field+'"'); } if(typeof isPrepend === 'number'){ this.procList.splice(isPrepend, 0, proc); } else if(isPrepend){ this.procList.unshift(proc); } else { 
this.procList.push(proc); } }; /** * remove a processing step by its index (within {@link #procList}) or its name * * NOTE: if multiple processing steps with the same name exist, the last one is removed * * @param {Number|String} proc the name or index of the processing step that should be removed * @return {ProcessingStep} the removed processing step, or undefined, * if there was no matchin processing step * * @see #addProc * @see #getProcIndex * @see #procList */ GrammarConverter.prototype.removeProc = function(proc){ if(typeof proc === 'number'){ return this.procList.splice(proc, 1)[0]; } else { var i = this.getProcIndex(proc); if(i !== -1){ return this.procList.splice(i, 1)[0]; } } return void(0); }; /** * remove a processing step by its index (within {@link #procList}) or its name * * NOTE: if multiple processing steps with the same name exist, the first one is removed * * @param {String} proc the name of the processing step * @param {Number} [startIndex] OPTIONAL start index for searching (DEFAULT: 0) * @return {Number} the index of the processing step, or -1, if there is no such processing step * * @see #addProc * @see #removeProc * @see #procList */ GrammarConverter.prototype.getProcIndex = function(procName, startIndex){ startIndex = startIndex || 0;//NOTE if startIndex is 0, or'ed value (0) is also valid for(var i=startIndex, size = this.procList.length; i < size; ++i){ if(this.procList[i].name === procName){ return i; } } return -1; }; /** * initialize default pre- and post-processing steps: * * * "escape": escape/unescape special characters (see {@link #maskString}, {@link #unmaskString})<br/> * * "stopwords": remove stopwords (see {@link #removeStopwords}) * * @private * @see #addProc * @see #removeProc * @see #getProcIndex * @see #procList */ GrammarConverter.prototype.initDefaultProc = function(){ this.addProc({ name: 'escape', pre: posUtil.createPosPreProc(this.maskString, this), post: function(procResult, _pos){ return this.unmaskJSON(procResult); 
} }); this.addProc({ name: 'stopwords', pre: posUtil.createPosPreProc(this.removeStopwords, this) }); }; /** * Execute the grammar. * * NOTE: do not use directly, but {@link mmir.SemanticInterpreter#interpret} instead, * since that function applies some pre- and post-processing to the text (stopword removal * en-/decoding of special characters etc.). * * @param {String} text * the text String that should be parse. * @param {Object} [options] * additional parsing options (some grammar engines may support further options) * options.debug: BOOLEAN enable printing debug information * options.trace: BOOLEAN | FUNCTION enable printing verbose/tracing information (may not be supported by the grammar engine) * @param {Function} [callback] * if #isAsyncExec is TRUE, then executeGrammar will have no return value, but instead the result * of the grammar execution is delivered by the <code>callback</code>: * <pre>function callback(result){ ... }</pre> * (see also description of <code>return</code> value below) * @returns {Object} * the result of the grammar execution: * <pre>{phrase: STRING, phrases: OBJECT[], semantic: OBJECT}</pre> * * The property <code>phrase</code> contains the <code>text</code> which was matched (with removed stopwords). * * The property <code>phrases</code> contains the matched <tt>TOKENS</tt> and <tt>UTTERANCES</tt> from * the JSON definition of the grammar as properties as arrays * (e.g. for 1 matched TOKEN "token": <code>{token: ["the matched text"]}</code>). * * The returned property <code>semantic</code> depends on the JSON definition of the grammar. * * NOTE: if #isAsyncExec is TRUE, then there will be no return value, but instead the callback * is invoked with the return value. * */ GrammarConverter.prototype.executeGrammar = function(text, options, callback){ console.warn('GrammarConverter.executeGrammar: this is only a stub. 
No grammar implementation set yet, ignoring executeGrammar() with arguments', text, options, callback); }; //TODO move masking/recoding functions to separate utility module? /** * Masks unicoded characters strings. * * Unicode characters are mask by replacing them with * <code>~~XXXX~~</code> * where <code>XXXX</code> is the four digit unicode HEX number. * * <p> * NOTE that this function is <em>stable</em> with regard to * multiple executions: * * If the function is invoked on the returned String again, the * returned String will be the same / unchanged, i.e. * maskings (i.e. "~~XXXX~~") will not be masked again. * </p> * <p> * NOTE: currently, the masking pattern cannot be escaped, * i.e. if the original String contains a substring * that matches the masking pattern, it cannot * be escaped, so that the unmask-function * will leave it untouched. * </p> * * @param {String} str * the String to process * @param {Boolean} [computePositions] OPTIONAL * DEFAULT: false * @param {String} [prefix] OPTIONAL * an alternative prefix used for masking, i.e instead of <code>~~</code> * (ignored, if argument has other type than <code>string</code>) * @param {String} [postfix] OPTIONAL * an alternative postfix used for masking, i.e instead of <code>~~</code> * (ignored, if argument has other type than <code>string</code>) * @returns {String|{str: String, pos: ARRAY<Position>}} * the masked string, or if <code>computePositions</code> was <code>true</code> * a result object with * <pre> * { * text: STRING, // the masked string * pos: [POSITION] // array of maskink-positions: {i: NUMBER, len: NUMBER, mlen: NUMBER} * } * </pre> * where POSITION is an object with * <pre> * { * i: NUMBER, // the index within the modified string * len: NUMBER, // the length before the modification (i.e. of sub-string that is to be masked) * mlen: NUMBER // the length after the modification (i.e. 
of sub-string that was masked)
 * 			}
 * 			</pre>
 */
GrammarConverter.prototype.maskString = function (str, computePositions, prefix, postfix) {

	var i, ch, peek, result,
		next, endline, push,
		mask,
		source = str;

	var positions, esclen;//<- will only be used, if computePositions === TRUE

	//shift arguments if necessary, i.e. support invocation maskString(str, prefix, postfix)
	if(typeof computePositions === 'string'){
		postfix = prefix;
		prefix = computePositions;
		computePositions = false;
	}

	//use default masking ~~XXXX~~, if no (string) prefix/postfix is specified
	var ESC_START = typeof prefix === 'string'? prefix : '~~';
	var ESC_END = typeof postfix === 'string'? postfix : '~~';

	// Stash the next character and advance the pointer
	next = function () {
		peek = source.charAt(i);
		i += 1;
	};

	// Emit a (normalized) line break: a single \n is added to the output
	endline = function () {
		result.push('\n');
	};

	// Emit the masking sequence for a character: prefix + 4-digit HEX code (zero-padded, upper case) + postfix
	mask = function (theChar) {

		if(computePositions){
			//store position information for the masking:
			// i: position in original string (the pointer i is 2 positions ahead of the current char, see main loop)
			// len: modified length of the string, i.e. the length of masking string
			// mlen: original length of the string, i.e. the length of the string that will get masked (in this case it is always 1, i.e. 1 char)
			positions.push({i: i-2, len: esclen, mlen: theChar.length});
		}

		result.push(ESC_START);

		var theUnicode = theChar.charCodeAt(0).toString(16).toUpperCase();
		//pad with leading zeros to 4 digits
		var j = theUnicode.length;
		while (j < 4) {
			result.push('0');
			++j;
		}
		result.push(theUnicode);

		result.push(ESC_END);
	};

	// Push the current character (or its masking sequence) onto the output
	push = function () {

		//handle NEWLINE: the 2-char line breaks \r\n and \n\r are consumed as one line break
		if (ch === '\r' || ch === '\n') {
			if (ch === '\r') {
				if (peek === '\n') {
					next();
				}
				endline();
			}
			if (ch === '\n') {
				if (peek === '\r') {
					next();
				}
				endline();
			}
		}
		//handle tabs: keep as they are
		else if (ch === '\t') {
			result.push(ch);
		}
		//handle NON-ASCII: everything outside of the printable ASCII range ' ' ... '~' is masked
		else if (ch < ' ' || ch > '~') {
			mask( ch );
		}
		//handle normal chars
		else {
			result.push(ch);
		}
	};

	result = [];

	if(computePositions){
		//length of a masking sequence (with its 4-digit HEX number)
		esclen = ESC_START.length + 4 + ESC_END.length;
		positions = [];
	}

	//process the string char by char, with 1 char look-ahead (peek):
	i = 0;
	next();
	while (i <= source.length) { // less than or equal, because i is always one ahead
		ch = peek;
		next();
		push();
	}

	if(computePositions){
		return {text: result.join(''), pos: positions};
	}
	return result.join('');
};

/**
 * HELPER uses #maskString for encoding non-ASCII chars to their Unicode representation,
 * i.e. <code>\uXXXX</code> where XXXX is the Unicode HEX number.
 *
 *
 * SHORTCUT for calling <code>maskString(str, '\\u', '')</code>.
*
 * @param {String} str the string for unicode masking
 * @param {Boolean} [computePositions] OPTIONAL
 * 			DEFAULT: false
 * @returns {String|{text: String, pos: ARRAY<Position>}}
 * 			the unicode-masked string, or if <code>computePositions</code> was <code>true</code>
 * 			a result object with
 * 			<pre>
 * 			{
 * 				text: STRING,     // the masked string
 * 				pos: [POSITION]   // array of masking-positions: {i: NUMBER, len: NUMBER, mlen: NUMBER}
 * 			}
 * 			</pre>
 * 			where POSITION is an object with
 * 			<pre>
 * 			{
 * 				i: NUMBER,        // the index within the original (i.e. un-masked) string
 * 				len: NUMBER,      // the length after the modification (i.e. of the masking sequence)
 * 				mlen: NUMBER      // the length before the modification (i.e. of sub-string that was masked)
 * 			}
 * 			</pre>
 *
 * @example
 * //for Japanese "下さい" ("please")
 * maskAsUnicode("下さい") // -> "\u4E0B\u3055\u3044"
 *
 * //... and using default masking:
 * maskString("下さい") // -> "~~4E0B~~~~3055~~~~3044~~"
 */
GrammarConverter.prototype.maskAsUnicode = function (str, computePositions) {
	//prefix "\u" (i.e. backslash + u) and empty postfix
	return this.maskString(str, computePositions, '\\u', '');
};

/**
 * Unmasks <i>masked unicoded characters</i> in a string.
 *
 * Masked unicode characters are assumed to have the pattern:
 * <code>~~XXXX~~</code>
 * where <code>XXXX</code> is the four digit unicode HEX number.
 *
 * <p>
 * NOTE that this function is <em>stable</em> with regard to
 * multiple executions, <b>IF</b> the original String <tt>str</tt> did not
 * contain a sub-string that conforms to the encoding pattern
 * (see remark for {@link #maskString}):
 *
 * If the function is invoked on the returned String again, the
 * returned String will be the same, i.e. unchanged.
 * </p>
 *
 * @param {String} str
 * @param {Boolean} [computePositions] OPTIONAL
 * 			DEFAULT: false
 * @param {RegExp} [detector] OPTIONAL
 * 			an alternative detector-RegExp:
 * 			the RegExp must contain at least one grouping which detects a unicode number (HEX),
 * 			e.g.
default detector is <code>~~([0-9|A-F|a-f]{4})~~</code> (note the grouping * for detecting a 4-digit HEX number within the brackets). * @returns {String|{str: String, pos: ARRAY<Position>}} * the masked string, or if <code>computePositions</code> was <code>true</code> * a result object with * <pre> * { * text: STRING, // the masked string * pos: [POSITION] // array of maskink-positions: {i: NUMBER, len: NUMBER, mlen: NUMBER} * } * </pre> * where POSITION is an object with * <pre> * { * i: NUMBER, // the index within the modified string * len: NUMBER, // the length before the modification (i.e. of sub-string that is to be masked) * mlen: NUMBER // the length after the modification (i.e. of sub-string that that was masked) * } * </pre> */ GrammarConverter.prototype.unmaskString = function (str, computePositions, detector) { var match, mlen, ch, positions, source = str, result = [], pos = 0, i, len = str.length; //shift arguments if necessary if(typeof computePositions === 'object'){ detector = computePositions; computePositions = false; } if(computePositions){ positions = []; } //RegExpr for: ~~XXXX~~ // where XXXX is the unicode HEX number: ~~([0-9|A-F|a-f]{4})~~ var REGEXPR_ESC = detector? detector : new RegExp( this.enc_regexp_str, "igm"); while(match = REGEXPR_ESC.exec(source)){ i = match.index; mlen = match[0].length; //add previous: if(i > pos){ result.push(source.substring(pos, i)); } //add matched ESC as UNICODE: ch = String.fromCharCode( parseInt(match[1], 16) ); result.push(ch); //update position: pos = i + mlen; if(computePositions){ //store position information for the masking: // i: position in original string // len: modified length of the string, i.e. the length of the unmasked string // mlen: original length of the string, i.e. 
the length of the masked string, that will get unmasked positions.push({i: i, len: ch.length, mlen: mlen}); } } if(pos < len){ result.push(source.substring(pos)); } // //FIXM DEBUG: show position-logging for masking // if(computePositions && positions.length > 0){ // console.log('--------LOG-UNMASK-pos("'+str+'" -> "'+result.join('')+'"): '); // var lres = result.join(''); // var loffset = 0; // for(var li = 0, lsize = positions.length; li < lsize; ++li){ // var lpos = positions[li]; // console.log(' '+JSON.stringify(lpos) + ' "'+str.substring(lpos.i, lpos.i + lpos.mlen)+'" -> "'+lres.substring(loffset + lpos.i, loffset + lpos.i + lpos.len)+'"'); // loffset += lpos.len - lpos.mlen; // } // }//END: DEBUG if(computePositions){ return {text: result.join(''), pos: positions}; } return result.join(''); }; GrammarConverter.prototype.maskJSON = function (json, isMaskValues, isMaskNames) { return this.recodeJSON(json, this.maskString, isMaskValues, isMaskNames); }; GrammarConverter.prototype.unmaskJSON = function (json, isMaskValues, isMaskNames) { return this.recodeJSON(json, this.unmaskString, isMaskValues, isMaskNames); }; /** * Recodes Strings of a JSON-like object. * * @function * @param {Object} json * the JSON-like object (i.e. PlainObject) * * @param {Function} recodeFunc * the "recoding" function for modifying String values: * must accecpt a String argument and return a String * <code>String recodeFunc(String)</code>. * The <tt>recodeFunc</tt> function is invoked in context of the GrammarConverter object. * Example: this.maskString(). * See {@link #maskString}.k * * @param {Boolean} [isMaskValues] OPTIONAL * if true, the object's property String values will be processed * NOTE: in case this parameter is specified, then <code>recodeFunc</code> must * also be specified! 
* DEFAULT: uses property {@link #maskValues} * @param {Boolean} [isMaskNames] OPTIONAL * if true, the property names will be processed * NOTE: in case this parameter is specified, then <code>recodeFunc</code> and * <code>isMaskValues</code> must also be specified! * DEFAULT: uses property {@link #maskNames} * * @returns {Object} the recoded JSON object * * @requires util/isArray */ GrammarConverter.prototype.recodeJSON = (function (isArray) {//<- NOTE this is only the initializer (i.e. see returned function below) /** * HELPER for sorting position objects * * @private */ var sortPosFunc = function(pos1, pos2){ return pos1.target.i - pos2.target.i; }; /** * HELPER for setting a recoded string value * * @param {StringResult|String} recodedVal * the recoding-result: * <pre>{str: STRING, pos: ARRAY<POSITION>}</pre> * * If undefined, nothing will be done * * @param {String} origVal * the original string value (i.e. "un-recoded") * * @param {Object} obj * the parent-object for the recoded string property * * @param {String} pname * the property name in the parent-object for the recoded string property * * @param {Array<Position>} [recodedPositions] OPTIONAL * if present, the modification information of the recoding will be added to the array * The elements of the array: * <pre> * { * target: Token, // the token that was modified/recoded * mlen: NUMBER // the length of the un-modified string (i.e. before recoding) * } * </pre> * where Token: * <pre> * { * i: NUMBER, // the index of the token w.r.t. 
to the input string * tok: STRING, // the (recoded/modified) token * } * </pre> * @private */ var setRecodedVal = function(recodedVal, origVal, obj, pname, recodedPositions){ var recVal; if(typeof recodedVal === 'string'){ recVal = recodedVal; } else if(typeof recodedVal !== 'undefined' && typeof recodedVal.text === 'string'){ recVal = recodedVal.text; } //only set, if there was a recoding: if(typeof recVal !== 'undefined' && typeof recVal === 'string'){ if(origVal !== recVal){ //set recoded value var str = recVal; obj[pname] = str; } //special treatment for token-objects, i.e. // { // tok: STRING, // i: NUMBER // } // // -> store some information for recalculating the index, in case tokens were recoded if(pname === 'tok' && typeof obj.i === 'number'){ // var offset = 0; // var pos; // for(var i=recodedVal.pos.length-1; i >= 0; --i){ // pos = recodedVal.pos[i]; // offset += pos.mlen - pos.len; // } var modLen = origVal.length;// offset + str.length; // if(offset + str.length !== origVal.length){ // console.error('ERROR: unexpected length!!!!'); // } // obj.len = origVal.length - offset; // if(obj.len !== obj.tok.length){ // console.error('ERROR: unexpected length!!!!'); // } if(recodedPositions){ recodedPositions.push({target: obj, mlen: modLen});//, i: start});//recodedVal); } } } }; /** * HELPER for adjusting the index-information in token-objects of an SemanticResult * (w.r.t. recoded tokens). * * @param {Array} recodedPositions * the list with modification information w.r.t. 
the tokens (as created by setRecodedVal) * * @see #setRecodedVal * @private */ var recalculatePos = function(recodedPositions){ if(recodedPositions && recodedPositions.length > 0){ // console.log('__________RECODE_pre-sort__'+JSON.stringify(recodedPositions));//FIXM DEBUG recodedPositions.sort(sortPosFunc); // console.log('__________RECODE_post-sort_'+JSON.stringify(recodedPositions));//FIXM DEBUG var repos, token; var offset = 0; for(var i=0, size = recodedPositions.length; i < size; ++i){ repos = recodedPositions[i]; token = repos.target; token.i -= offset; offset += repos.mlen - token.tok.length; } } }; /** * Recursive processing for an object / recoding a JSON-like object. * NOTE: the recoding happens "in-place", i.e. the object itself is modified * * See doc of recodeJSON() for details w.r.t. the arguments * * NOTE: argument recodedPositions is an internal (OPTIONAL) parameter * that is used when recoding SemanticResult objects (applied grammar) * * @returns {PlainObject} the object where its string-values are recoded * @private */ var processJSON = function(obj, recodeFunc, isMaskValues, isMaskNames, recodedPositions){ //different treatments for: STRING, ARRAY, OBJECT types (and 'REST' type, i.e. all others) if(typeof obj === 'string' && isMaskValues){ //STRING: encode the string return recodeFunc.call(this, obj, true); } else if( isArray(obj) ) { //ARRAY: process all entries: for(var i=0, size = obj.length; i < size; ++i){ var pv = obj[i]; var pvn = processJSON.call(this, pv, recodeFunc, isMaskValues, isMaskNames, recodedPositions); setRecodedVal(pvn, pv, obj, i, recodedPositions); } return obj; } else if(obj === null) {//NOTE null is typeof object! return null; } else if(typeof obj === 'object') { //OBJECT: process all the object's properties (but only, if they are not inherited) for(var p in obj){ if(obj.hasOwnProperty(p)){ var pv = obj[p]; //special treatment for token-lists, i.e. 
elements like: // // phrases: [ // { // tok: STRING | ARRAY<TOK>, // type: STRING, // i: NUMBER // }, // ... // ] // // -> create list for storing some information for recalculating the index, in case tokens were recoded var isCalcPos = false; if(!recodedPositions && p === 'phrases' && typeof pv === 'object' && pv){// typeof pv.i === 'number' && typeof pv.tok === 'string'){ isCalcPos = true; recodedPositions = []; } var pvn = processJSON.call(this, pv, recodeFunc, isMaskValues, isMaskNames, recodedPositions); setRecodedVal(pvn, pv, obj, p, recodedPositions); if(isCalcPos){ recalculatePos(recodedPositions); recodedPositions = void(0); } //if the property-name should also be encoded: if(typeof p === 'string' && isMaskNames){ var masked = recodeFunc.call(this, p); if(masked && typeof masked.text === 'string' && masked.text !== p){ obj[masked.text] = obj[p]; delete obj[p]; } } } } return obj; } else { return obj; } }; return function (json, recodeFunc, isMaskValues, isMaskNames){ //evaluate arguments: if(typeof isMaskValues === 'undefined'){ isMaskValues = this.maskValues; } if(typeof isMaskNames === 'undefined'){ isMaskNames = this.maskNames; } return processJSON.call(this, json, recodeFunc, isMaskValues, isMaskNames); }; })(isArray);//<- dependency util/isArray return GrammarConverter; });//END: define(..., function(){