wink-nlp
Version:
Developer friendly Natural Language Processing ✨
506 lines (472 loc) • 20.3 kB
JavaScript
// wink-nlp
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of “wink-nlp”.
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and
// associated documentation files (the "Software"), to
// deal in the Software without restriction, including
// without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial
// portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
/* eslint-disable no-console */
/* eslint-disable guard-for-in */
const composePatterns = require( './compose-patterns.js' );
const identifyMarkedArea = require( './identify-marked-area.js' );
const eosTokenN = 2070000;
const eosTokenX = '$%^EoS^%$';
const otherwiseN = 2070003;
const otherwiseX = ' otherwise';
var simpleFSM = function ( cache, token2Ignore ) {
// Returned!
var methods = Object.create( null );
// Holds FSM in the following structure:<br/>
// curr state —> event —> next state <br/>
// One of the event is `otherwise`, whose next state defines the default state.
var fsm = Object.create( null );
// The root or the beginning state of the `fsm`.
const root = 0;
// Tracks the last used state. Whenever a new state is needed, its value is
// incremented and returned. See `getNextState()`.
var lastUsedState = 0;
// The terminal states i.e. the detected patterns: maps state to name.
var terminalStates = Object.create( null );
// The terminal states, where part of pattern has been marked out.
var markedStates = Object.create( null );
// Add-ons value is stored here.
var customPropertyAtStates = Object.create( null );
// Use to substitute tokens by patterns in a multi-pass scenario.
var substitutions;
// On pattern detection function.
var onPatternDetectionFn;
// By default always ignore the new line character, else use the value supplied
// by `token2Ignore`; this will usually be the OOV lexeme, i.e. `$%^oov^%$`.
const toBeIgnoredToken = ( token2Ignore === undefined ) ? '\n' : token2Ignore;
// The `cache` is `undefined`, when things have to work on token text — for
// learning & recognition both. For native case of learning (i.e. generation),
// it can be `null` or real value; and native mode recognition will always
// need real value of the `cache`.
// Setup `keyLF/eosToken` to use during entity detection on the basis of `cache`
// value — It is critical for model generation.
const keyLF = ( cache === undefined || cache === null ) ? toBeIgnoredToken : cache.lookup( toBeIgnoredToken )[ 0 ];
const eosToken = ( cache === undefined || cache === null ) ? eosTokenX : eosTokenN;
// The `otherwise` event; including a space to ensure that such an input can
// never arrive from the token stream. Later on it will be changed to numeric
// value > `0xFFFFF` i.e. the limit of vocabulary.
const otherwise = ( cache === undefined ) ? otherwiseX : otherwiseN;
// ## getNextState
/**
*
* Returns the next state to be assigned i.e. the next unused state or
* a state corresponding to target, if defined.
*
* @param {number} index of current token.
* @param {number} last index of last token.
* @param {number} target state of the pattern being processed; could be
* `undefined` if it is being encountered for the first time.
* @returns {number} next state that should be assigned for the current event.
* @private
*/
var getNextState = function ( index, last, target ) {
// Check its invocation in the of fsm.
if ( index === last && target ) return target;
// Compute next unused state & return. Note this now becomes the last
// used state!
lastUsedState += 1;
return lastUsedState;
}; // getNextState()
// ## learnSinglePattern
/**
*
* Learns a single pattern.
*
* @param {string} name of the pattern to be learned.
* @param {array} pattern to be learned.
* @param {array} mark `[ start, end ]`.
* @param {any} customProperty contains definable value(s).
* @returns {undefined} Nothing!
* @private
*/
var learnSinglePattern = function ( name, pattern, mark, customProperty ) {
const length = pattern.length;
// Last element.
const last = length - 1;
// Target state for this pattern, would be `undefined` if this pattern type is
// enountered for the first time (`undefined` disables collapse of states).
const target = undefined;
// Tracks the `state` as the FSM builds up, specially useful if there are
// machines with shared path i.e. common `(state, events)` pairs.
let state = root;
// Assigned for `otherwise` events.
let goBackTo = root;
// Temp for event & next state.
let ev, nextState;
// Iterate through the pattern's tokens, while discovering any existing
// machine that can share path.
for ( let k = 0; k < length; k += 1 ) {
ev = pattern[ k ];
// Create new state & intialize, if required.
if ( fsm[ state ] === undefined ) {
fsm[ state ] = Object.create( null );
fsm[ state ][ otherwise ] = goBackTo;
}
// Check for machines that may share path.
if ( fsm[ state ][ ev ] === undefined ) {
// None found, create new state transition by assigning the next state for
// the current event – `ev`.
nextState = getNextState( k, last, target );
fsm[ state ][ ev ] = nextState;
// Always compute state transition from the perspective of discovering
// shared path: here the `fsm[ state ][ ev ]` has been just assigned
// `nextState`, therefore `state` needs to transition to this state only.
state = nextState;
} else if ( terminalStates[ fsm[ state ][ ev ] ] ) {
// Case when shared path is found and the next state on the path is a
// terminal state.
if ( fsm[ state ][ otherwise ] === root ) fsm[ state ][ otherwise ] = goBackTo;
goBackTo = fsm[ state ][ ev ];
nextState = getNextState( k, last, target );
fsm[ state ][ ev ] = nextState;
// Compute state transition; again like earlier case, it would be `nextState`.
state = nextState;
} else if ( k === last ) {
// Case when shared path is found and the next state on the path is NOT
// a terminal state AND current token is the LAST one.
nextState = getNextState( k, last, target );
fsm[ fsm[ state ][ ev ] ][ otherwise ] = nextState;
state = nextState;
} else {
// Case when shared path is found and the next state on the path is NOT
// a terminal state AND current token is NOT the LAST one.<br/>
// Simply compute state transition, no other work to be done!
state = fsm[ state ][ ev ];
}
}
terminalStates[ state ] = name;
if ( mark ) {
// Update last element of `mark` to simplifies computations during fsm
// execution. Update must happen as a deep copy & not directly!
markedStates[ state ] = identifyMarkedArea( mark, length );
}
if ( customProperty !== undefined ) {
customPropertyAtStates[ state ] = customProperty;
}
}; // learnSinglePattern()
// ## learn
/**
*
* Learns the patterns that must be detected via recognize() API calls.
*
* @param {Object[]} patterns to be learned.
*
* @param {string} patterns[].name of the pattern.
* @param {string} patterns[].structure of the pattern.
* @returns {number} of uniquely named patterns.
* `[ pattern-id, start-token, end-token ]` format.
* @private
*/
var learn = function ( patterns ) {
// Temp for counting unique.
var obj = Object.create( null );
// Composed Patterns
var cp = [];
for ( let i = 0; i < patterns.length; i += 1 ) {
const pi = patterns[ i ];
if ( typeof pi.pattern === 'string' ) {
const all = composePatterns( pi.pattern );
for ( let j = 0; j < all.length; j += 1 )
cp.push( { name: pi.name, pattern: all[ j ], mark: pi.mark, customProperty: pi.customProperty } );
} else cp.push( { name: pi.name, pattern: pi.pattern, mark: pi.mark, customProperty: pi.customProperty } );
}
// Sort to get the longest pattern on the top.
cp.sort( ( a, b ) => ( b.pattern.length - a.pattern.length ) );
// All set, now learn using composed patterns – `cp`!
for ( let i = 0; i < cp.length; i += 1 ) {
learnSinglePattern( cp[ i ].name, cp[ i ].pattern, cp[ i ].mark, cp[ i ].customProperty );
}
// Return number of uniquely named patterns.
for ( const ts in terminalStates ) obj[ terminalStates[ ts ] ] = true;
return ( ( Object.keys( obj ) ).length );
}; // learn()
// ## setOnPatternDetectionFn
/**
*
* Defines the function that is called on every detected pattern, provided
* the detected pattern had an `customProperty` property defined.
* @param {function} f to be called with `match` & `customProperty` value as parameters.
* @returns {boolean} `true` if it was a success otherwise `false`.
* @private
*/
var setOnPatternDetectionFn = function ( f ) {
if ( typeof f === 'function' ) {
onPatternDetectionFn = f;
return true;
}
return false;
}; // setOnPatternDetectionFn()
// ## pushMatch2Patterns
/**
*
* Pushes a `match`ed pattern details into the `patterns` array after handling
* marking and calling the on pattern detection function, if required. Before
* pushing a `match` to patterns, the state (numeric) at `match[ 2 ]` is mapped
* to its name using `terminalStates`; remember the `state` passed here is
* always the terminal state. Passing state in match ensures that respective
* `mark` and `customProperty` are handled differently if they have different values in
* a state-machine rows, even though the `names` are identical.
*
* @param {array} patterns where the `match` is pushed.
* @param {array} match pushed in to the `patterns`. The `match` conntains
* 3-entries viz. 0—state, 1 & 2—start & end indexes of `tokens`.
* @returns {undefined} Nothing.
* @private
*/
var pushMatch2Patterns = function ( patterns, match ) {
// Extract the state at match[ 0 ].
var m0 = match[ 2 ];
// Pattern name `'0'` — simply ignore it!
if ( terminalStates[ m0 ] === '0' ) return;
// Not to be ignored — process it.
var mark = markedStates[ m0 ];
var customProperty = customPropertyAtStates[ m0 ];
if ( mark ) {
match[ 0 ] += mark[ 0 ];
match[ 1 ] -= mark[ 1 ];
}
// Removed `customProperty !== undefined &&` check while coding pos experiment
if ( onPatternDetectionFn )
onPatternDetectionFn( match, customProperty );
match[ 2 ] = terminalStates[ m0 ];
patterns.push( match );
}; // pushPattern()
// ## setPatternSwap
/**
*
* Sets up the patterns to be used for token substitution/swap in the
* `recognize()` api.
*
* @param {array[]} patterns to be used for substitutions in `recognize()`.
* @returns {undefined} Nothing.
* @private
*/
var setPatternSwap = function ( patterns ) {
if ( !patterns || !Array.isArray( patterns ) ) {
substitutions = undefined;
return;
}
// Old `substitutions` are re-initialized.
substitutions = Object.create( null );
// Sort patterns by the start of pattern index.
patterns.sort( ( a, b ) => ( a[ 0 ] > b[ 0 ] ) );
// Index it by start of pattern.
patterns.forEach( ( e ) => ( substitutions[ e[ 0 ] ] = [ e[ 1 ], e[ 2 ] ] ) );
}; // setPatternSwap()
// ## recognize
/**
*
* Recognizes patterns present in the input tokens in a greedy manner.
*
* @param {array} tokens in which the patterns need to be recognized.
* @param {function} [transformToken] an optional function that is called before
* processing every token.
* @param {*} [param] that has to be passed as the last param to `transformToken()`
* function.
* @returns {array[]} where each element follows
* `[ pattern-id, start-token, end-token ]` format.
* @private
*/
var recognize = function ( tokens, transformToken, param ) {
// Length of the `tokens.`
const length = tokens.length;
// Check if `transformToken` is a valid function.
var transformTokenFn = ( typeof transformToken === 'function' ) ? transformToken : null;
// Detected patterns are captured here. Each element has the following format: <br/>
// `[ pattern-id, start-token, end-token ]`
var patterns = [];
// We don't need a separate state machines unlike `recognize()`, as the
// following set of variables together act like a singleton machine.
var first = 0;
var state = root;
// Next State.
var ns = root;
// Temp. for a single pattern.
var p = null;
// Last non-root otherwise state & index
var lastOtherwiseIndex;
var lastOtherwiseState;
// Temp. for a token.
var t;
// Used to increment `j` and computing span of pattern correctly, may become
// > 1 if an earlier detected pattern is longer that 1-token.
var delta = 1;
for ( let i = 0; i <= length; i += 1 ) {
// **Attempt greedy lookup**:<br/>
// Keep digging until next state becomes `root` or a terminal state is
// encountered. Upon failure after a partial match, roll back is required
// so that the extra consumed tokens can be explored by machine.
for ( let j = i; j <= length; j += delta ) {
// Extract current token.
t = ( j === length ) ? eosToken : tokens[ j ];
// Skip the newline character; TODO: will replace by the hash value!
// Use direct hash for the time being later, it must be obtained via cache
if ( t === keyLF ) continue; // eslint-disable-line no-continue
// Perform replacements using earlier detected patterns.
if ( substitutions && substitutions[ j ] ) {
t = substitutions[ j ][ 1 ];
delta = substitutions[ j ][ 0 ] - j + 1;
} else delta = 1;
// Apply token transformation function, if defined. Must not be called
// for the `eosToken`.
if ( transformTokenFn && ( j < length ) ) t = transformTokenFn( t, cache, param, j );
// Find next state on the basis of current `state` and current token – `t`.
ns = fsm[ state ][ t ] || root;
// Detect the state transition to capture `first` token of a potential upcoming
// pattern. If state is `root` and the next state is `non-root` indicates
// that we have just starting chasing for a new pattern.
if ( !state && ns ) first = j;
if ( terminalStates[ ns ] ) {
// Terminal state encountered, save this pattern. Update span using `delta`.
p = [ first, j + delta - 1, ns ];
pushMatch2Patterns( patterns, p );
// Set index to `j`, so that iterations can commence from `j + 1` as
// for-loop increments the index variable at the end of loop!
i = j;
// Ensures that the inner loop terminates!
j = length + 100;
// Pattern has been discovered, so next state must be set to `root`.
ns = root;
// Same is true for the last saved otherwise state.
lastOtherwiseState = root;
} else if ( ns === root ) {
// Not a terminal state but the next state has hit the `root`.
if ( lastOtherwiseState ) {
// But we have a `non-root` last saved otherwise state; this means
// we must save this pattern.
p = [ first, lastOtherwiseIndex, lastOtherwiseState ];
pushMatch2Patterns( patterns, p );
// Set index to the index corresponding to the above last saved otherwise
// state.
i = lastOtherwiseIndex;
// Ensure that the inner loop terminates;
j = length + 100;
// Pattern has been discovered, so next state must be set to `root`.
ns = root;
// Same is true for the last saved otherwise state.
lastOtherwiseState = root;
} else {
// The last saved otherwise state is pointing to `root`: terminate
// the inner loop without updating the index variable — this ensures
// complete roll back.
j = length + 100;
}
}
// Update the current state.
state = ns;
// Save (last) non-root otherwise state & index, if any.
if ( fsm[ state ][ otherwise ] ) {
// Update span using `delta`.
lastOtherwiseIndex = j + delta - 1;
lastOtherwiseState = fsm[ state ][ otherwise ];
}
}
}
return patterns;
}; // recognize()
// ## exportJSON
/**
* Exports the learning as a JSON, which may be saved as a text file for
* later use via `importJSON()`.
*
* @return {string} Learning in JSON format.
* @private
*/
var exportJSON = function () {
return JSON.stringify(
[ 100, lastUsedState, fsm, terminalStates, markedStates, customPropertyAtStates ]
);
}; // exportJSON()
// ## emptyModelJSON
/**
* Exports the an empty model's JSON. Useful in model generation.
*
* @return {string} Learning in JSON format.
* @private
*/
var emptyModelJSON = function () {
// Empty machine!
const m0 = Object.create( null );
m0[ 0 ] = Object.create( null );
return JSON.stringify(
[ 100,
0, // `lastUsedState`.
m0, // `fsm`,
Object.create( null ), // `terminalStates`,
Object.create( null ), // `markedStates`,
Object.create( null ), // `customPropertyAtStates`
]
);
}; // emptyModelJSON()
// ## importJSON
/**
* Imports an existing JSON learning for recognition.
*
* @param {JSON} json containing learnings in as exported by `exportJSON()`.
* @return {void} Nothing!
* @throws Error if `json` is invalid.
* @private
*/
var importJSON = function ( json ) {
var model = JSON.parse( json );
lastUsedState = model[ 1 ];
fsm = model[ 2 ];
terminalStates = model[ 3 ];
markedStates = model[ 4 ];
customPropertyAtStates = model[ 5 ];
}; // importJSON()
// Prints the model in terms of the state machine & terminal states.
var printModel = function () {
console.log( 'State Machine:' );
console.log( JSON.stringify( fsm, null, 2 ) );
console.log();
console.log( 'Terminal States:' );
console.log( JSON.stringify( terminalStates, null, 2 ) );
console.log();
console.log( 'Marked States:' );
console.log( JSON.stringify( markedStates, null, 2 ) );
console.log();
console.log( 'customProperty States:' );
console.log( JSON.stringify( customPropertyAtStates, null, 2 ) );
}; // printModel()
methods.learn = learn;
methods.recognize = recognize;
methods.setPatternSwap = setPatternSwap;
methods.setOnPatternDetectionFn = setOnPatternDetectionFn;
methods.exportJSON = exportJSON;
methods.importJSON = importJSON;
methods.emptyModelJSON = emptyModelJSON;
methods.printModel = printModel;
// This a dummy statement to ensure 100% coverage; because feature of
// collapsing shared states into single one was **disabled** due to `mark`.
getNextState( 0, 0, 99 );
return methods;
}; // fsm()
module.exports = simpleFSM;