wink-nlp
Version:
Developer friendly Natural Language Processing ✨
590 lines (521 loc) • 23.5 kB
JavaScript
// wink-nlp
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of “wink-nlp”.
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and
// associated documentation files (the "Software"), to
// deal in the Software without restriction, including
// without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial
// portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
/* eslint-disable no-underscore-dangle */
/* eslint-disable no-console */
var containedEntities = require( './contained-entities.js' );
// ### Helper Functions
// Get **item at** collection, selection & parent.
var getParentItem = require( './api/get-parent-item.js' );
var colGetItemAt = require( './api/col-get-item.js' );
var selGetItemAt = require( './api/sel-get-item.js' );
// **Each** iterator for collection & selection.
var colEach = require( './api/col-each.js' );
var selEach = require( './api/sel-each.js' );
// **Filter** for collection & selection.
var colFilter = require( './api/col-filter.js' );
var selFilter = require( './api/sel-filter.js' );
// **Token's out** for item, collection & selection.
var itmTokenOut = require( './api/itm-token-out.js' );
var colTokensOut = require( './api/col-tokens-out.js' );
var selTokensOut = require( './api/sel-tokens-out.js' );
// **Entity's out** for item, collection & selection.
var itmEntityOut = require( './api/itm-entity-out.js' );
var colEntitiesOut = require( './api/col-entities-out.js' );
var selEntitiesOut = require( './api/sel-entities-out.js' );
// **Sentence's out** for item, collection & selection.
var itmSentenceOut = require( './api/itm-sentence-out.js' );
var colSentencesOut = require( './api/col-sentences-out.js' );
// **Document's out** for item.
var itmDocumentOut = require( './api/itm-document-out.js' );
// Print tokens, it is primarily for command line output.
var printTokens = require( './api/print-tokens.js' );
var its = require( './its.js' );
// <hr/>
// # Doc
/**
*
* The wink-nlp **doc**ument – constructed in `wink-nlp.js` – publishes the
* developer APIs.
*
* @param {object} docData It encapsulates the document data.
* @param {object} addons The model's addon, may contain word vectors, stemmer etc.
* @return {object} containing APIs.
* @private
*/
var doc = function ( docData, addons ) {
// Pull everything needed out of `docData` in one go; `cache` is extracted as
// it is frequently accessed.
//
// `tokens` — the document's tokens; each token is represented as an array of
// numbers:
// ```
// [
//   hash, // of tokenized lexeme
//   (nox) + preceding spaces, // expansion's normal
//   pos + lemma, // pos & lemma are contextual
//   entity + sentence // 12bit + 20bits
// ]
// ```
// `entities` — sorted as array of `[ start, end, entity type ].`
// `customEntities` — the custom entities of the document.
// `sentences` — sorted as array of pairs of `[ start, end ]` pointing to the `tokens`.
// `markings` — 4-tuples of `start`, `end` **token indexes**, and `begin & end
// markers`; the begin & end markers are used to markup the tokens specified.
const { cache, tokens, entities, customEntities, sentences, markings } = docData;
// #### API core functions:
// They are declared up front and assigned later, because they refer to each other.
// Item APIs.
let itemToken;
let itemEntity;
let itemCustomEntity;
let itemSentence;
// Collection APIs.
let colTokens;
let colEntities;
let colCustomEntities;
let colSentences;
// Selection — obtained via `filter` — APIs. It is also like a collection.
let colSelectedTokens;
let colSelectedEntities;
let colSelectedCustomEntities;
// Vectors API
let contextualVectors;
// Others.
const isLexeme = cache.lookup;
// The Document — Returned!
const methods = Object.create( null );
// ## Token
// **Item, Collection, and Selection APIs.**
// ### itemToken
/**
 *
 * Builds the item API for the token found at `index`.
 *
 * @param {number} index The index of the token, for which the item API is required.
 * @return {object} containing applicable API methods.
 * @private
*/
itemToken = function ( index ) {
  return Object.assign( Object.create( null ), {
    // Access the parent document.
    parentDocument: () => methods,
    // Access the parent entity, **if any.**
    parentEntity: () => getParentItem( index, entities, itemEntity ),
    // Access the parent custom entity, **if any.**
    parentCustomEntity: () => getParentItem( index, customEntities, itemCustomEntity ),
    // Markup this token: it begins & ends at the same `index`.
    markup: ( beginMarker, endMarker ) => markings.push( [ index, index, beginMarker, endMarker ] ),
    // Output this token or its properties using mapper function — `f`.
    out: ( f ) => itmTokenOut( index, docData, f, addons ),
    // Access the parent sentence.
    parentSentence: () => getParentItem( index, sentences, itemSentence ),
    // Index within the document.
    index: () => index
  } );
}; // itemToken()
// ### colSelectedTokens
/**
 *
 * Builds the collection API over the tokens listed in the `selectedTokens` array.
 *
 * @param {array} selectedTokens The array of selected tokens, using which the
 * collection is made.
 * @return {object} containing applicable API methods.
 * @private
*/
colSelectedTokens = function ( selectedTokens ) {
  return Object.assign( Object.create( null ), {
    // Iterator.
    each: ( f ) => selEach( f, selectedTokens, itemToken ),
    // Filter; the result is again a selection of tokens.
    filter: ( f ) => selFilter( f, selectedTokens, itemToken, colSelectedTokens ),
    // Item at `k`th index. If `k` is outside valid range, return `undefined` like JS.
    itemAt: ( k ) => selGetItemAt( k, selectedTokens, itemToken ),
    // Number of selected tokens.
    length: () => selectedTokens.length,
    // Output this collection of selected tokens as a reduced values or properties
    // using map/reduce functions — `f/g`.
    out: ( f, g ) => selTokensOut( selectedTokens, docData, f, g, addons )
  } );
}; // colSelectedTokens()
// ### colTokens
/**
 *
 * Prepares a collection of tokens beginning from `start` index to `end` index,
 * both inclusive. The collection is not built right away: a function is
 * returned, which builds a fresh collection API every time it is called.
 *
 * @param {number} start The start index.
 * @param {number} end The end index.
 * @return {function} that returns an object containing applicable API methods.
 * @private
*/
colTokens = function ( start, end ) {
  return function () {
    return Object.assign( Object.create( null ), {
      // Iterator.
      each: ( f ) => colEach( f, start, end, itemToken ),
      // Filter; the result is a selection of tokens.
      filter: ( f ) => colFilter( f, start, end, itemToken, colSelectedTokens ),
      // Item at `k`th index. If `k` is outside valid range, return `undefined` like JS.
      // No need to handle relative indexing as `colGetItemAt` handles it.
      itemAt: ( k ) => colGetItemAt( k, start, end, itemToken ),
      // Length of this collection; `end` is inclusive, hence the `+ 1`.
      length: () => end - start + 1,
      // Output this token collection as a reduced values or properties using
      // map/reduce functions — `f/g`.
      out: ( f, g ) => colTokensOut( start, end, docData, f, g, addons )
    } );
  };
}; // colTokens()
// <hr/>
// ## Entity
// **Item, Collection, and Selection APIs.**
// ### itemEntity
/**
 *
 * Builds the item API for the entity found at `index` of `entities`.
 *
 * @param {number} index The index of the entity, which is required to be
 * returned as item entity.
 * @return {object} containing applicable API methods.
 * @private
*/
itemEntity = function ( index ) {
  return Object.assign( Object.create( null ), {
    // Access the parent document.
    parentDocument: () => methods,
    // Markup this entity, from its start token to its end token.
    markup: ( beginMarker, endMarker ) => markings.push( [ entities[ index ][ 0 ], entities[ index ][ 1 ], beginMarker, endMarker ] ),
    // Output this entity or its properties using mapper function — `f`.
    out: ( f ) => itmEntityOut( index, entities, docData, f ),
    // Access the parent sentence; it is located using the entity's start token.
    parentSentence: () => getParentItem( entities[ index ][ 0 ], sentences, itemSentence ),
    // Return collection of tokens contained in this entity.
    tokens: colTokens( entities[ index ][ 0 ], entities[ index ][ 1 ] ),
    // Index within the document.
    index: () => index
  } );
}; // itemEntity()
// ### colSelectedEntities
/**
 *
 * Builds the collection API over the entities listed in the `selectedEntities` array.
 *
 * @param {array} selectedEntities The array of selected entities, using which
 * the collection is made.
 * @return {object} containing applicable API methods.
 * @private
*/
colSelectedEntities = function ( selectedEntities ) {
  return Object.assign( Object.create( null ), {
    // Iterator.
    each: ( f ) => selEach( f, selectedEntities, itemEntity ),
    // Filter; the result is again a selection of entities.
    filter: ( f ) => selFilter( f, selectedEntities, itemEntity, colSelectedEntities ),
    // Item at `k`th index. If `k` is outside valid range, return `undefined` like JS.
    itemAt: ( k ) => selGetItemAt( k, selectedEntities, itemEntity ),
    // Number of selected entities.
    length: () => selectedEntities.length,
    // Output this collection of selected entities as a reduced value
    // using map/reduce functions — `f/g`.
    out: ( f, g ) => selEntitiesOut( selectedEntities, entities, docData, f, g )
  } );
}; // colSelectedEntities()
// ### colEntities
/**
 *
 * Builds the collection API over all the `entities` of the document. The
 * bounds are computed from `entities.length` at the time of each call.
 *
 * @return {object} containing applicable API methods.
 * @private
*/
colEntities = function () {
  return Object.assign( Object.create( null ), {
    // Iterator.
    each: ( f ) => colEach( f, 0, entities.length - 1, itemEntity ),
    // Filter; the result is a selection of entities.
    filter: ( f ) => colFilter( f, 0, entities.length - 1, itemEntity, colSelectedEntities ),
    // Item at `k`th index. If `k` is outside valid range, return `undefined` like JS.
    itemAt: ( k ) => colGetItemAt( k, 0, ( entities.length - 1 ), itemEntity ),
    // Length of this collection.
    length: () => entities.length,
    // Output this collection of entities as a reduced value
    // using map/reduce functions — `f/g`.
    out: ( f, g ) => colEntitiesOut( entities, docData, f, g )
  } );
}; // colEntities()
// <hr/>
// ## Custom Entity
// **Item, Collection, and Selection APIs.**
// ### itemCustomEntity
/**
 *
 * Builds the item API for the custom entity found at `index` of `customEntities`.
 *
 * @param {number} index The index of the custom entity, which is required to be
 * returned as item custom entity.
 * @return {object} containing applicable API methods.
 * @private
*/
itemCustomEntity = function ( index ) {
  return Object.assign( Object.create( null ), {
    // Access the parent document.
    parentDocument: () => methods,
    // Markup this custom entity, from its start token to its end token.
    markup: ( beginMarker, endMarker ) => markings.push( [ customEntities[ index ][ 0 ], customEntities[ index ][ 1 ], beginMarker, endMarker ] ),
    // Output this custom entity or its properties using mapper function — `f`.
    out: ( f ) => itmEntityOut( index, customEntities, docData, f ),
    // Access the parent sentence; it is located using the custom entity's start token.
    parentSentence: () => getParentItem( customEntities[ index ][ 0 ], sentences, itemSentence ),
    // Return collection of tokens contained in this custom entity.
    tokens: colTokens( customEntities[ index ][ 0 ], customEntities[ index ][ 1 ] ),
    // Index within the document.
    index: () => index
  } );
}; // itemCustomEntity()
// ### colSelectedCustomEntities
/**
 *
 * Builds the collection API over the custom entities listed in the
 * `selectedCustomEntities` array.
 *
 * @param {array} selectedCustomEntities The array of selected custom entities,
 * using which the collection is made.
 * @return {object} containing applicable API methods.
 * @private
*/
colSelectedCustomEntities = function ( selectedCustomEntities ) {
  return Object.assign( Object.create( null ), {
    // Iterator.
    each: ( f ) => selEach( f, selectedCustomEntities, itemCustomEntity ),
    // Filter; the result is again a selection of custom entities.
    filter: ( f ) => selFilter( f, selectedCustomEntities, itemCustomEntity, colSelectedCustomEntities ),
    // Item at `k`th index. If `k` is outside valid range, return `undefined` like JS.
    itemAt: ( k ) => selGetItemAt( k, selectedCustomEntities, itemCustomEntity ),
    // Number of selected custom entities.
    length: () => selectedCustomEntities.length,
    // Output this collection of selected custom entities as a reduced value
    // using map/reduce functions — `f/g`.
    out: ( f, g ) => selEntitiesOut( selectedCustomEntities, customEntities, docData, f, g )
  } );
}; // colSelectedCustomEntities()
// ### colCustomEntities
/**
 *
 * Builds the collection API over all the `customEntities` of the document. The
 * bounds are computed from `customEntities.length` at the time of each call.
 *
 * @return {object} containing applicable API methods.
 * @private
*/
colCustomEntities = function () {
  return Object.assign( Object.create( null ), {
    // Iterator.
    each: ( f ) => colEach( f, 0, customEntities.length - 1, itemCustomEntity ),
    // Filter; the result is a selection of custom entities.
    filter: ( f ) => colFilter( f, 0, customEntities.length - 1, itemCustomEntity, colSelectedCustomEntities ),
    // Item at `k`th index. If `k` is outside valid range, return `undefined` like JS.
    itemAt: ( k ) => colGetItemAt( k, 0, ( customEntities.length - 1 ), itemCustomEntity ),
    // Length of this collection.
    length: () => customEntities.length,
    // Output this collection of custom entities as a reduced value
    // using map/reduce functions — `f/g`.
    out: ( f, g ) => colEntitiesOut( customEntities, docData, f, g )
  } );
}; // colCustomEntities()
// <hr/>
// ## Sentence
// **Item, Collection, and Selection APIs.**
// ### itemSentence
/**
 *
 * Builds the item API for the sentence found at `index` of `sentences`.
 *
 * @param {number} index The index of the sentence.
 * @return {object} containing applicable API methods.
 * @private
*/
itemSentence = function ( index ) {
  return Object.assign( Object.create( null ), {
    // Access the parent document.
    parentDocument: () => methods,
    // Markup this sentence, from its start token to its end token.
    markup: ( beginMarker, endMarker ) => markings.push( [ sentences[ index ][ 0 ], sentences[ index ][ 1 ], beginMarker, endMarker ] ),
    // Output this sentence or its properties using mapper function — `f`.
    out: ( f ) => itmSentenceOut( index, docData, f, addons ),
    // Outputs the collection of entities, if any, contained in this sentence.
    entities: () => colSelectedEntities( containedEntities( entities, sentences[ index ][ 0 ], sentences[ index ][ 1 ] ) ),
    // Outputs the collection of custom entities, if any, contained in this sentence.
    customEntities: () => colSelectedCustomEntities( containedEntities( customEntities, sentences[ index ][ 0 ], sentences[ index ][ 1 ] ) ),
    // Outputs the collection of tokens in this sentence.
    tokens: colTokens( sentences[ index ][ 0 ], sentences[ index ][ 1 ] ),
    // Index within the document.
    index: () => index
  } );
}; // itemSentence()
// ### colSentences
/**
 *
 * Builds the collection API over all the `sentences` of the document. The
 * bounds are computed from `sentences.length` at the time of each call.
 *
 * @return {object} containing applicable API methods.
 * @private
*/
colSentences = function () {
  return Object.assign( Object.create( null ), {
    // Iterator.
    each: ( f ) => colEach( f, 0, sentences.length - 1, itemSentence ),
    // Item at `k`th index. If `k` is outside valid range, return `undefined` like JS.
    itemAt: ( k ) => colGetItemAt( k, 0, ( sentences.length - 1 ), itemSentence ),
    // Length of this collection.
    length: () => sentences.length,
    // Output this collection of sentences using mapper function — `f`.
    out: ( f ) => colSentencesOut( docData, f, addons )
  } );
}; // colSentences()
// <hr/>
// ### contextualVectors
/**
 *
 * Makes a JSON of contextually relevant words in the winkNLP format. The
 * vectors of the following words are included, in this order: the document's
 * tokens in lower case, their lemmas (optional), the specific words requested,
 * the closest word of each of these (optional), and finally the words from the
 * top of the loaded vocabulary until `wordVectorsLimit` is reached. A word
 * without a loaded vector receives a copy of the `unkVector`.
 *
 * @param {object} [options] The configuration; all of its properties are optional.
 * @param {boolean} [options.lemma=true] `true` means lemmas of the tokens are
 * also included; it needs "pos" in the NLP pipe.
 * @param {array} [options.specificWordVectors=[]] Additional words to include;
 * every entry is converted to string and trimmed, falsy or blank entries are ignored.
 * @param {boolean} [options.similarWordVectors=false] `true` means for every word
 * included so far, its closest word by Manhattan distance is included as well.
 * @param {number} [options.wordVectorsLimit=0] The minimum number of vectors to
 * return; it must be an integer less than the size of the loaded word vectors.
 * @return {string} containing the JSON.
 * @throws {Error} if the word vectors are not loaded, or `specificWordVectors` is
 * not an array, or `wordVectorsLimit` is invalid, or `lemma` is requested without
 * "pos" in the NLP pipe.
*/
// eslint-disable-next-line complexity
contextualVectors = function ( { lemma = true, specificWordVectors = [], similarWordVectors = false, wordVectorsLimit = 0 } = {} ) {
  const wordVectors = docData.wordVectors;
  // Error handling! Missing word vectors — `null` or `undefined` — must lead to
  // the error below and not to a `TypeError` further down.
  if ( wordVectors === null || wordVectors === undefined )
    throw Error( 'wink-nlp: word vectors are not loaded: load them winkNLP\'s instantiation time.' );
  if ( !Array.isArray( specificWordVectors ) )
    throw Error( `wink-nlp: expecting a valid Javascript array for specificWordVectors, instead found "${typeof specificWordVectors}".` );
  if ( !Number.isInteger( wordVectorsLimit ) || wordVectorsLimit >= wordVectors.size )
    throw Error( 'wink-nlp: invalid value or type encountered for wordVectorsLimit.' );
  if ( lemma && !docData.currPipe.pos )
    throw Error( 'wink-nlp: Can\'t create lemma vectors without pos: add a "pos" to NLP pipe.' );

  // Initialize contextual vectors; the order of properties defines the JSON layout.
  const cv = Object.create( null );
  // Following properties are constants, therefore can be directly copied.
  cv.precision = wordVectors.precision;
  cv.l2NormIndex = wordVectors.l2NormIndex;
  cv.wordIndex = wordVectors.wordIndex;
  cv.dimensions = wordVectors.dimensions;
  cv.unkVector = wordVectors.unkVector.slice( 0 );
  // Following properties will be determined on the basis of the context.
  cv.size = 0;
  cv.words = [];
  cv.vectors = Object.create( null );

  // Shortcut all word vectors.
  const awvs = wordVectors.vectors;
  const hasOwn = Object.prototype.hasOwnProperty;
  // Returns a copy of the vector of `word`, or of the UNK vector if it has none.
  // Only own properties count: otherwise a word like "constructor" would pick
  // up an inherited member of a plain object instead of a vector.
  const vectorOf = ( word ) => ( ( hasOwn.call( awvs, word ) && awvs[ word ] ) || cv.unkVector ).slice( 0 );

  // Extract all document's tokens, and their lemmas if required.
  const allTokens = colTokens( 0, docData.numOfTokens - 1 )();
  const docTokens = allTokens.out().map( ( t ) => t.toLowerCase() );
  const docTokensLemma = ( lemma ) ? allTokens.out( its.lemma ).map( ( t ) => t.toLowerCase() ) : [];

  // NOTE: For UNK words the `unkVector` is set up; as per the original note it is
  // an all zero vector with `l2Norm = 0`, which may be used in as.vector helper
  // to detect an UNK word.
  for ( const word of docTokens ) cv.vectors[ word ] = vectorOf( word );
  for ( const word of docTokensLemma ) cv.vectors[ word ] = vectorOf( word );
  for ( const entry of specificWordVectors ) {
    const spWord = ( entry ) ? entry.toString().trim() : '';
    // Key and look up by the trimmed word, so that ' cat ' resolves to 'cat'.
    if ( spWord ) cv.vectors[ spWord ] = vectorOf( spWord );
  }

  if ( similarWordVectors ) {
    // Extract similar words on the basis of shortest Manhattan distance.
    const allUniqueTokens = Object.keys( cv.vectors );
    // Set up similar words array, with the size of all unique tokens.
    const similarWords = new Array( allUniqueTokens.length );
    // Placeholder for maintaining the similarity score based on Manhattan
    // distance; initialize to a large distance!
    const similarWordsScore = new Array( allUniqueTokens.length ).fill( 1000000 );
    // Initialize contextual vectors size i.e. vocab.
    cv.size = allUniqueTokens.length;
    // Now search each one of them in the entire word vectors space.
    // Keep updating the smallest distance.
    for ( let i = 0; i < allUniqueTokens.length; i += 1 ) {
      const cwv = cv.vectors[ allUniqueTokens[ i ] ];
      for ( const word in awvs ) { // eslint-disable-line guard-for-in
        if ( word === allUniqueTokens[ i ] ) continue; // eslint-disable-line no-continue
        const wv = awvs[ word ];
        let distance = 0;
        // Stop accumulating as soon as the best distance so far is reached.
        for ( let k = 0; k < cv.dimensions && distance < similarWordsScore[ i ]; k += 1 ) {
          distance += Math.abs( cwv[ k ] - wv[ k ] );
        } // Manhattan distance computation loop.
        if ( distance < similarWordsScore[ i ] ) {
          similarWordsScore[ i ] = distance;
          similarWords[ i ] = word;
        }
      } // Traversing all the word vectors.
    } // Traversing all the tokens in the corpus.
    // Update contextual vectors using the list of similar words; also update their size.
    for ( let i = 0; i < similarWords.length; i += 1 ) {
      const similarWord = similarWords[ i ];
      // No similar word is found when nothing is closer than the initial large
      // distance: skip instead of failing on a missing vector.
      if ( similarWord !== undefined && cv.vectors[ similarWord ] === undefined ) {
        // Similar word must exist in `awvs`.
        cv.vectors[ similarWord ] = awvs[ similarWord ].slice( 0 );
        cv.size += 1;
      }
    }
  } else cv.size = Object.keys( cv.vectors ).length;

  // Fill the balance space, if any, on the basis of wordVectorsLimit.
  for ( let i = 0; cv.size < wordVectorsLimit; i += 1 ) {
    const word = wordVectors.words[ i ];
    // The vocabulary is exhausted: stop, rather than looping forever or failing.
    if ( word === undefined ) break;
    if ( !cv.vectors[ word ] ) {
      cv.vectors[ word ] = awvs[ word ].slice( 0 );
      cv.size += 1;
    }
  }

  // Sort words on the basis of their usage frequency; a negative word index,
  // e.g. of an UNK word, pushes the word to the end.
  cv.words = Object.keys( cv.vectors )
    .map( ( w ) => ( { w: w, i: ( cv.vectors[ w ][ cv.wordIndex ] < 0 ) ? Infinity : cv.vectors[ w ][ cv.wordIndex ] } ) )
    .sort( ( a, b ) => a.i - b.i )
    .map( ( o ) => o.w );
  // Update the word index entry inside every vector.
  for ( let i = 0; i < cv.size; i += 1 ) cv.vectors[ cv.words[ i ] ][ cv.wordIndex ] = i;

  return JSON.stringify( cv );
}; // contextualVectors()
// Published chainable methods.
methods.entities = colEntities;
methods.customEntities = colCustomEntities;
methods.isLexeme = isLexeme;
methods.isOOV = cache.isOOV;
methods.out = ( f ) => itmDocumentOut( docData, f, addons );
methods.sentences = colSentences;
methods.tokens = colTokens( 0, docData.numOfTokens - 1 );
methods.printTokens = () => printTokens( tokens, cache );
// Enusre that we make a deep copy of config before returning to avoid corruption!
methods.pipeConfig = () => JSON.parse( JSON.stringify( docData.currPipe ) );
methods.contextualVectors = contextualVectors;
return methods;
};
module.exports = doc;