
wink-nlp


Developer friendly Natural Language Processing ✨

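The file below is the package's main entry point. For orientation, here is a minimal usage sketch of the factory it exports; the language-model package name and the Doc methods chained on the result of readDoc() are assumptions drawn from the wider wink-nlp ecosystem, not from this file.

    const winkNLP = require( 'wink-nlp' );
    // Assumed companion model package; any compatible wink-nlp language model works here.
    const model = require( 'wink-eng-lite-web-model' );
    // A pipe array, e.g. [ 'sbd', 'pos', 'ner' ], may be passed as the second argument;
    // omitting it enables every annotation the model provides.
    const nlp = winkNLP( model );
    const doc = nlp.readDoc( 'The quick brown fox jumps over the lazy dog.' );
    // tokens() and out() are assumed here; they are exposed by the Doc object (doc-v2.js).
    console.log( doc.tokens().out() );
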
// wink-nlp
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of "wink-nlp".
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and
// associated documentation files (the "Software"), to
// deal in the Software without restriction, including
// without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial
// portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//

var DocDataWrapper = require( './dd-wrapper.js' );
var Doc = require( './doc-v2.js' );
var Cache = require( './cache.js' );
var tokenizer = require( './tokenizer.js' );
var compileTRex = require( './compile-trex.js' );
var mappers = require( './tokens-mappers.js' );
var itsHelpers = require( './its.js' );
var asHelpers = require( './as.js' );
var mapRawTokens2UIdOfNormal = mappers.mapRawTokens2UIdOfNormal;
var mapRawTokens2UIdOfDefaultPOS = mappers.mapRawTokens2UIdOfDefaultPOS;
var Compiler = require( './examples-compiler.js' );
var constants = require( './constants.js' );
var fsm = require( './automaton.js' );
var search = require( './search.js' );
var locate = require( './locate.js' );
var helper = require( './helper.js' );

// Size of a single token.
var tkSize = constants.tkSize;

/**
 * Creates an instance of nlp.
 * @private
 *
 * @param {object} theModel language model.
 * @param {string[]} pipe of nlp annotations.
 * @param {object} wordEmbeddings object read using node require.
 * @returns {object} containing set of API methods for natural language processing.
 * @example
 * const nlp = require( 'wink-nlp' );
 * var myNLP = nlp();
 */
var nlp = function ( theModel, pipe = null, wordEmbeddings = null ) {
  var methods = Object.create( null );
  // Token Regex; compiled from `model`.
  var trex;
  // wink-nlp language `model`.
  var model;
  // Holds instance of `cache` created using the `model`.
  var cache;
  // NLP Pipe Config.
  // var nlpPipe = Object.create( null );
  // Configured tokenize.
  var tokenize;
  // Automata
  // 1. NER
  var nerAutomata;
  var nerTransformers;
  // 2. SBD
  var sbdAutomata;
  var sbdTransformers;
  var sbdSetter;
  // 3. NEG
  var negAutomata;
  var negSetter;
  // SA
  var saAutomata;
  var saSetter;
  // POS
  var posAutomata;
  var posTransformers;
  var posSetter;
  var posUpdater;
  // Patterns or Custom Entities
  var cerAutomata;
  var cerTransformer;
  var cerLearnings = 0;
  var cerPreserve;
  var cerConfig;
  // Used for compiling examples.
  var compiler;
  // Used to instantiate the compiler.
  var cerMetaModel;
  // Contains a list of valid annotations built from `theModel`.
  var validAnnotations = Object.create( null );
  // Current pipe.
  var currPipe = Object.create( null );
  var onlyTokenization = true;

  // Private methods.

  // ## load
  /**
   * Loads the model containing the core model along with other applicable
   * models.
   * @private
   *
   * @returns {void} nothing!
   * @private
   */
  var load = function () {
    // Load language model.
    model = theModel.core();
    // With `intrinsicSize` captured, instantiate cache etc.
    cache = Cache( model, theModel.featureFn ); // eslint-disable-line new-cap
    trex = compileTRex( model.trex );
    // Instantiate tokenizer.
    tokenize = tokenizer( trex, model.tcat.hash, model.preserve );

    // Load & setup SBD model.
    var sbdModel = theModel.sbd();
    sbdAutomata = new Array( sbdModel.machines.length );
    sbdTransformers = new Array( sbdModel.machines.length );
    for ( let i = 0; i < sbdModel.machines.length; i += 1 ) {
      sbdAutomata[ i ] = fsm( cache );
      sbdAutomata[ i ].importJSON( sbdModel.machines[ i ] );
      sbdTransformers[ i ] = sbdModel.transformers[ i ];
    }
    sbdSetter = sbdModel.setter;

    // Load & setup NER model.
    var nerModel = theModel.ner();
    nerAutomata = new Array( nerModel.machines.length );
    nerTransformers = new Array( nerModel.machines.length );
    for ( let i = 0; i < nerModel.machines.length; i += 1 ) {
      nerAutomata[ i ] = fsm( cache );
      nerAutomata[ i ].importJSON( nerModel.machines[ i ] );
      nerTransformers[ i ] = nerModel.transformers[ i ];
    }

    var negModel = theModel.negation();
    negAutomata = fsm( cache );
    negAutomata.importJSON( negModel.machines[ 0 ] );
    negSetter = negModel.setter;

    var saModel = theModel.sa();
    saAutomata = fsm( cache );
    saAutomata.importJSON( saModel.machines[ 0 ] );
    saSetter = saModel.setter;

    var posModel = theModel.pos();
    posAutomata = new Array( posModel.machines.length );
    posTransformers = new Array( nerModel.machines.length );
    for ( let i = 0; i < posModel.machines.length; i += 1 ) {
      // Ignore only OOV literal and not new line character in the case of POS Tagging.
      posAutomata[ i ] = fsm( cache, cache.value( 0 ) );
      posAutomata[ i ].importJSON( posModel.machines[ i ] );
      posTransformers[ i ] = posModel.transformers[ i ];
    }
    posSetter = posModel.setter;
    posUpdater = posModel.updater;

    var cmModel = theModel.metaCER();
    cerMetaModel = cmModel.machines;
    cerTransformer = cmModel.transformers[ 0 ];
    // posAutomata = fsm( cache, cache.value( 0 ) );
    // posAutomata.importJSON( posModel.machines[ 0 ] );
    // posTransformer = posModel.transformers[ 0 ];
  }; // load()

  // Public Methods.

  // ## readDoc
  /**
   * Loads a single document to be processed.
   * @private
   *
   * @param {string} text of the document that you want to process.
   * @returns {object} the document in terms of an object that exposes the API.
   * @example
   * const DOC = "The quick brown fox jumps over the lazy dog";
   * myNLP.readDoc(DOC);
   */
  var readDoc = function ( text ) {
    if ( typeof text !== 'string' ) {
      throw Error( `wink-nlp: expecting a valid Javascript string, instead found "${typeof text}".` );
    }
    // Raw Document Data-structure gets populated here as NLP pipe tasks execute!
    var rdd = Object.create( null );
    // The `cache` is also part of document data structure.
    rdd.cache = cache;
    // Each document gets a pointer to the word vectors.
    rdd.wordVectors = wordEmbeddings;
    // Document's tokens; each token is represented as an array of numbers:
    // ```
    // [
    //   hash, // of tokenized lexeme
    //   (nox) + preceding spaces, // expansion's normal
    //   pos + lemma, // pos & lemma are contextual
    //   negation flag // 1 bit at msb
    // ]
    // ```
    rdd.tokens = [];
    // Sentences — stored as array of pairs of `[ start, end ]` pointing to the `tokens`.
    rdd.sentences = [];
    // Markings are 4-tuples of `start`, `end` **token indexes**, and `begin & end markers`.
    // The begin & end markers are used to markup the tokens specified.
    rdd.markings = [];
    // Publish the current annotation pipeline so that code can inquire about
    // active annotations!
    rdd.currPipe = currPipe;
    // Set storage for non-breaking spaces.
    rdd.nonBreakingSpaces = Object.create( null );

    var wrappedDocData = DocDataWrapper( rdd ); // eslint-disable-line new-cap

    // Start of NLP Pipe
    tokenize( wrappedDocData, text ); // eslint-disable-line new-cap

    // Compute number of tokens.
    rdd.numOfTokens = rdd.tokens.length / tkSize;

    // This structure is identical to sentences ( or entities ), for the sake of uniformity.
    // The structure is `[ start, end, negationFlag, sentimentScore ]`.
    rdd.document = [ 0, ( rdd.numOfTokens - 1 ), 0, 0 ];

    // Map tokens for automata if there are other annotations to be performed.
    var tokens4Automata = ( onlyTokenization ) ? null : mapRawTokens2UIdOfNormal( rdd );
    var px;

    if ( currPipe.sbd ) {
      // Sentence Boundary Detection.
      // Set first `Pattern Swap (x)` as `null`.
      px = null;
      for ( let i = 0; i < sbdAutomata.length; i += 1 ) {
        sbdAutomata[ i ].setPatternSwap( px );
        // For SBD, all tokens are required to extract preceding spaces.
        px = sbdAutomata[ i ].recognize( tokens4Automata, sbdTransformers[ i ], rdd.tokens );
      }
      // The structure of sentence is:<br/>
      // `[ start, end, negationFlag, sentimentScore ]`
      sbdSetter( px, rdd );
      // Compute number of sentences!
      rdd.numOfSentences = rdd.sentences.length;
    } else {
      // Setup default sentence as entire document!
      rdd.numOfSentences = 1;
      rdd.sentences = [ [ 0, ( rdd.numOfTokens - 1 ), 0, 0 ] ];
    }

    if ( currPipe.ner ) {
      // Named entity detection.
      px = null;
      for ( let i = 0; i < nerAutomata.length; i += 1 ) {
        nerAutomata[ i ].setPatternSwap( px );
        px = nerAutomata[ i ].recognize( tokens4Automata, nerTransformers[ i ] );
      }
      // Entities — stored as array of `[ start, end, entity type ]`.
      // There is no setter for entities as no transformation is needed.
      rdd.entities = px;
    } else {
      rdd.entities = [];
    }

    if ( currPipe.negation ) {
      // Negation
      px = null;
      px = negAutomata.recognize( tokens4Automata );
      negSetter( px, rdd, constants, search );
    }

    if ( currPipe.sentiment ) {
      // Sentiment Analysis
      px = null;
      px = saAutomata.recognize( tokens4Automata );
      saSetter( px, rdd, constants, locate );
    }

    if ( currPipe.pos ) {
      // PoS Tagging
      const posTags = mapRawTokens2UIdOfDefaultPOS( rdd );
      px = null;
      for ( let i = 0; i < posAutomata.length; i += 1 ) {
        px = posAutomata[ i ].recognize( posTags, posTransformers[ 0 ], rdd.tokens );
        posUpdater( px, cache, posTags, tokens4Automata );
      }
      posSetter( rdd, posTags, tkSize, constants.bits4lemma );
    }

    if ( currPipe.cer ) {
      // Patterns
      px = null;
      if ( cerAutomata !== undefined && cerLearnings > 0 ) {
        cerConfig.rdd = rdd;
        cerConfig.preserve = cerPreserve;
        cerConfig.constants = constants;
        if ( cerConfig.useEntity ) cerAutomata.setPatternSwap( rdd.entities );
        px = cerAutomata.recognize( tokens4Automata, cerTransformer, cerConfig );
      }
      // If there are no custom entities, then `px` will be `null`; in such a case
      // set `customEntities` to an empty array.
      rdd.customEntities = px || [];
    } else rdd.customEntities = [];

    // Word Vector
    // if ( theModel.wordVectors !== undefined ) {
    //
    // }

    // Now create the document!
    var doc = Doc( rdd, theModel.addons ); // eslint-disable-line new-cap

    // All done — cleanup document's data.
    wrappedDocData.clean();
    return doc;
  }; // readDoc()

  var learnCustomEntities = function ( examples, config ) {
    // Ensure (a) `examples` is an array and (b) each of its elements is an object.
    if ( helper.isArray( examples ) ) {
      examples.forEach( ( ex ) => {
        if ( helper.isObject( ex ) ) {
          // The object must contain name & patterns property of string and array type respectively.
          if ( ( typeof ex.name !== 'string' ) || ( ex.name === '' ) ) {
            throw Error( `wink-nlp: name should be a string, instead found "${ex.name}":\n\n${JSON.stringify( ex, null, 2 )}` );
          } else if ( helper.isArray( ex.patterns ) ) {
            for ( let k = 0; k < ex.patterns.length; k += 1 ) {
              const p = ex.patterns[ k ];
              // Each pattern should be a string.
              if ( ( typeof p !== 'string' ) || ( p === '' ) ) {
                throw Error( `wink-nlp: each pattern should be a string, instead found "${p}":\n\n${JSON.stringify( ex, null, 2 )}` );
              }
            } // for ( let k = 0;... )
          } else {
            // Pattern is not an array.
            throw Error( `wink-nlp: patterns should be an array, instead found "${typeof ex.patterns}":\n\n${JSON.stringify( ex, null, 2 )}` );
          }
          // If mark is present then it should be an array of integers **and** its length must
          // be equal to 2 **and** start index <= end index.
          if ( ( ex.mark !== undefined ) &&
               ( !helper.isIntegerArray( ex.mark ) ||
                 ( ex.mark.length !== 2 ) ||
                 ( ex.mark.length === 2 && ex.mark[ 0 ] > ex.mark[ 1 ] ) ) ) {
            throw Error( `wink-nlp: mark should be an array containing start & end indexes, instead found:\n\n${JSON.stringify( ex.mark, null, 2 )}` );
          }
        } else {
          // Example is not an object.
          throw Error( `wink-nlp: each example should be an object, instead found a "${typeof ex}":\n\n${JSON.stringify( ex, null, 2 )}` );
        }
      } );
    } else {
      // Examples is not an array.
      throw Error( `wink-nlp: examples should be an array, instead found "${typeof examples}".` );
    }

    // Validate config
    cerConfig = ( config === undefined || config === null ) ? Object.create( null ) : JSON.parse( JSON.stringify( config ) );
    if ( !helper.isObject( cerConfig ) ) {
      throw Error( `wink-nlp: config should be an object, instead found "${typeof cerConfig}".` );
    }
    cerConfig.matchValue = !!cerConfig.matchValue;
    cerConfig.usePOS = ( cerConfig.usePOS === undefined ) ? true : !!cerConfig.usePOS;
    cerConfig.useEntity = ( cerConfig.useEntity === undefined ) ? true : !!cerConfig.useEntity;

    // Instantiate compiler.
    compiler = Compiler( cerMetaModel, cache, tokenize, cerConfig.matchValue ); // eslint-disable-line new-cap
    cerAutomata = null;
    cerLearnings = 0;
    cerAutomata = fsm();
    const compiled = compiler.run( examples );
    cerPreserve = compiled.preserve;
    cerLearnings = cerAutomata.learn( compiled.examples );
    // cerAutomata.printModel();
    return cerLearnings;
  }; // learnCustomEntities()

  if ( helper.isObject( theModel ) ) {
    if ( typeof theModel.core !== 'function' ) {
      throw Error( 'wink-nlp: invalid model used.' );
    }
  } else {
    throw Error( 'wink-nlp: invalid model used.' );
  }

  // Build a list of valid annotations from `theModel`. This will ensure that
  // only **available** annotations from the model can be used in the pipe.
  validAnnotations.sbd = typeof theModel.sbd === 'function';
  validAnnotations.negation = typeof theModel.negation === 'function';
  validAnnotations.sentiment = typeof theModel.sa === 'function';
  validAnnotations.pos = typeof theModel.pos === 'function';
  validAnnotations.ner = typeof theModel.ner === 'function';
  validAnnotations.cer = typeof theModel.metaCER === 'function';

  if ( wordEmbeddings !== null ) {
    if ( !helper.isObject( wordEmbeddings ) )
      throw Error( `wink-nlp: invalid word vectors, it must be an object instead found a "${typeof wordEmbeddings}".` );

    let numOfKeys = 0;
    const wordVectorKeys = Object.create( null );
    wordVectorKeys.precision = true;
    wordVectorKeys.l2NormIndex = true;
    wordVectorKeys.wordIndex = true;
    wordVectorKeys.dimensions = true;
    wordVectorKeys.unkVector = true;
    wordVectorKeys.size = true;
    wordVectorKeys.words = true;
    wordVectorKeys.vectors = true;
    for ( const key in wordEmbeddings ) { // eslint-disable-line guard-for-in
      numOfKeys += 1;
      if ( !wordVectorKeys[ key ] )
        throw Error( 'wink-nlp: invalid word vectors format.' );
    }
    if ( numOfKeys === 0 )
      throw Error( 'wink-nlp: empty word vectors found.' );
  }

  const tempPipe = ( pipe === null || pipe === undefined ) ? Object.keys( validAnnotations ) : pipe;
  if ( helper.isArray( tempPipe ) ) {
    tempPipe.forEach( ( at ) => {
      if ( !validAnnotations[ at ] )
        throw Error( `wink-nlp: invalid pipe annotation "${at}" found.` );
      currPipe[ at ] = true;
      onlyTokenization = false;
    } );
  } else throw Error( `wink-nlp: invalid pipe, it must be an array instead found a "${typeof pipe}".` );

  // Load the model.
  load();
  // Setup default configuration.
  // definePipeConfig();

  // Methods.
  methods.readDoc = readDoc;
  methods.learnCustomEntities = learnCustomEntities;
  // Expose `its` and `as` helpers.
  methods.its = itsHelpers;
  methods.as = asHelpers;

  // Vector of a token method.
  methods.vectorOf = function ( word, safe = true ) {
    if ( !wordEmbeddings )
      throw Error( 'wink-nlp: word vectors are not loaded, use const nlp = winkNLP( model, pipe, wordVectors ) to load.' );

    const vectors = wordEmbeddings.vectors;
    const unkVector = wordEmbeddings.unkVector;
    const sliceUpTo = wordEmbeddings.l2NormIndex + 1;
    if ( typeof word !== 'string' ) {
      throw Error( 'winkNLP: input word must be of type string.' );
    }
    const tv = vectors[ word.toLowerCase() ];
    if ( tv === undefined ) {
      // If unsafe, return the entire array.
      return ( safe ) ? unkVector.slice( 0, sliceUpTo ) : unkVector.slice();
    }
    return ( safe ) ? tv.slice( 0, sliceUpTo ) : tv.slice();
  }; // vectorOf()

  return methods;
}; // wink

module.exports = nlp;
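
For completeness, a sketch of the other two public methods, continuing the example at the top (nlp and model as created there); the entity patterns and the word-vectors package name are illustrative assumptions, not part of this file.

    // learnCustomEntities() takes an array of { name, patterns[, mark] } objects and an
    // optional config with matchValue, usePOS and useEntity flags; it returns the
    // learnings count produced by the CER automaton.
    const learnings = nlp.learnCustomEntities(
      [ { name: 'pet', patterns: [ 'cat', 'dog' ], mark: [ 0, 0 ] } ],
      { matchValue: false, usePOS: true, useEntity: true }
    );

    // vectorOf() works only when word vectors are passed as the third argument of winkNLP();
    // the embeddings package named here is an assumption.
    const vectors = require( 'wink-embeddings-sg-100d' );
    const nlpWithVectors = winkNLP( model, null, vectors );
    const v = nlpWithVectors.vectorOf( 'fox' ); // safe by default: sliced up to l2NormIndex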