// wink-nlp
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of “wink-nlp”.
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and
// associated documentation files (the "Software"), to
// deal in the Software without restriction, including
// without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial
// portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
/* eslint-disable no-underscore-dangle */
// Used in accessing the regex and its category from `rgxs`.
const RGX = 0;
const CAT = 1;
// SPECIAL REGULAR EXPRESSIONS:
// Regex to handle short forms or abbreviations.
var rgxShortFormDot = /^(?:(?:[A-Z])(?:\.))+$/i;
var rgxShortForm = /^(?:(?:[A-Z])(?:\.))+[a-z]?$/i;
// Regex to process hyphenated words.
var rgxHyphens = /[\-\–\—]/gi;
var rgxPeriod = /[\.]/gi;
var rgxNumber = /[0-9]/;
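// Illustrations of what these accept (a sketch, not part of the code path):
//   rgxShortFormDot.test( 'U.S.A.' );  // true: strictly dotted abbreviation
//   rgxShortFormDot.test( 'U.S.A' );   // false: no final period
//   rgxShortForm.test( 'U.S.A' );      // true: a trailing letter is allowed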
// ### tokenizer
/**
*
* Creates an instance of `tokenizer`.
*
* @param {object} categories token categories, as obtained via the language model.
* @param {object} preserve rules for hyphenation preservation.
* @return {function} for recursive tokenization.
* @private
*/
var tokenizer = function ( categories, preserve ) {
// Function to add tokens to the `doc()`.
var addToken;
var addTokenIfInCache;
// Function to test if lexeme exists via `doc()`.
var isLexeme;
// Preceding spaces; a special need of the recursive tokenizer.
var ps = 0;
// Will only be needed for the first token; after that it is all zero (ps)!
var nonBreakingSpaces = null;
// ### pushHyphenatedToken
/**
*
* Pushes the incoming `tkn` after handling hyphens, if any:
* 1. Use it as-is if it is a valid lexeme or contains a number.
* 2. Use it as-is if it does not contain hyphens.
* 3. Otherwise apply rules.
*
* @param {string} tkn to be processed as per the hyphenation rules in `preserve`.
* @param {array} tokens into which the (split) `tkn` is pushed.
* @returns {void} nothing!
* @private
*/
var pushHyphenatedToken = function ( tkn, tokens ) {
// Will contain pure alpha words, obtained by splitting on `rgxHyphens`.
var words;
// Will contain matched hyphens.
var hyphens;
// Helper variables.
var i, k, last;
// If a token is a valid lexeme or contains one or more numbers, don't touch it.
if ( isLexeme( tkn ) || rgxNumber.test( tkn ) ) {
tokens.push( [ tkn, categories.word ] );
return;
}
hyphens = tkn.match( rgxHyphens );
// If there are no hyphens in the word, don't touch it.
if ( hyphens === null ) {
tokens.push( [ tkn, categories.word ] );
return;
}
// Word is hyphenated, process it according to the rules specified in `preserve`.
words = tkn.split( rgxHyphens );
last = words.length - 1;
if ( preserve.prefix[ words[ 0 ] ] || preserve.suffix[ words[ last ] ] ) {
tokens.push( [ tkn, categories.word ] );
return;
}
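// For instance (a hypothetical `preserve` entry): with
// `preserve.prefix[ 'e' ]` set, a token like 'e-mail' is kept whole
// rather than being split at its hyphen.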
k = 0;
for ( i = 0; i < words.length; i += 1 ) {
// Do not push any empty token!
if ( words[ i ] !== '' ) {
tokens.push( [ words[ i ], categories.word ] );
}
if ( k < hyphens.length ) {
tokens.push( [ hyphens[ k ], categories.punctuation ] );
}
k += 1;
}
}; // pushHyphenatedToken()
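// A quick sketch of the fall-through case, assuming 'full-moon' is neither
// a known lexeme nor covered by `preserve`:
//   pushHyphenatedToken( 'full-moon', tokens );
//   // pushes [ 'full', word ], [ '-', punctuation ], [ 'moon', word ]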
// ### pushWordToken
/**
*
* Pushes the incoming `tkn` after handling any periods and hyphens present:
* 1. Use it as-is if it is a valid lexeme or a short form ending with a period.
* 2. Split on period and then successively assemble tokens using matches & splits.
* 3. Finally send each such assembled token down for hyphenated-word handling.
*
* @param {string} tkn to be processed and pushed.
* @param {array} tokens into which the (split) `tkn` is pushed.
* @returns {void} nothing!
* @private
*/
var pushWordToken = function ( tkn, tokens ) {
// Will contain words, obtained by splitting on `rgxPeriod`.
var words;
// Will contain matched periods.
var periods;
// Helper variables:
// Index variables.
var i, k;
// Used in successively assembling a potential token from matches & words
// (i.e. splits), if word has periods.
var currBuild = '';
var nextBuild = '';
// If a token is a **valid lexeme**, or it is a **short form ending with a
// period** (i.e. a dot), then _don't touch it._
if ( isLexeme( tkn ) || rgxShortFormDot.test( tkn ) ) {
tokens.push( [ tkn, categories.word ] );
return;
}
// Start by matching with periods
periods = tkn.match( rgxPeriod );
// If there are no periods in the word, don't touch it.
if ( periods === null ) {
pushHyphenatedToken( tkn, tokens );
return;
}
// Word has periods, therefore process it:
words = tkn.split( rgxPeriod );
k = 0;
for ( i = 0; i < words.length; i += 1 ) {
// Build next potential token by joining the current build with the next word.
nextBuild = currBuild + words[ i ];
// If it is a valid possibility, then continue building it.
if ( rgxShortForm.test( nextBuild ) || ( isLexeme( nextBuild ) && nextBuild.length > 2 ) || ( currBuild === '' ) ) {
currBuild = nextBuild;
} else {
// Else send it down to handle hyphenated word.
pushHyphenatedToken( currBuild, tokens );
// Reset builds.
currBuild = words[ i ];
nextBuild = '';
}
if ( k < periods.length ) {
// In the same manner handle period sign.
nextBuild = currBuild + periods[ k ];
if ( rgxShortForm.test( nextBuild ) || ( isLexeme( nextBuild ) && nextBuild.length > 2 ) ) {
currBuild = nextBuild;
} else {
pushHyphenatedToken( currBuild, tokens );
tokens.push( [ periods[ k ], categories.punctuation ] );
currBuild = '';
nextBuild = '';
}
}
k += 1;
}
// Handle the last piece if applicable.
if ( currBuild !== '' ) pushHyphenatedToken( currBuild, tokens );
}; // pushWordToken()
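// A worked illustration, assuming neither input is a known lexeme:
//   pushWordToken( 'U.S.A.', tokens );    // kept whole, it matches `rgxShortFormDot`
//   pushWordToken( 'wait.What', tokens ); // split into 'wait', '.', 'What'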
// ### tokenizeTextUnit
/**
*
* Attempts to tokenize the input `text` using the `rgxSplit`. The tokenization
* is carried out by combining the regex matches and splits in the right sequence.
* The matches are the *real tokens*, whereas splits are text units that are
* tokenized in later rounds! The real tokens (i.e. matches) are pushed as
* `object` and splits as `string`.
*
* @param {string} text unit that is to be tokenized.
* @param {object} rgxSplit object containing the regex and its category.
* @return {array} of tokens.
* @private
*/
var tokenizeTextUnit = function ( text, rgxSplit ) {
// Regex matches go here; note each match is a token and has the same tag
// as the regex's category.
var matches = text.match( rgxSplit[ RGX ] );
// Balance is "what needs to be tokenized".
var balance = text.split( rgxSplit[ RGX ] );
// The result, formed by combining tokens & matches, is captured here.
var tokens = [];
// The tag.
var tag = rgxSplit[ CAT ];
// Helper variables.
var i,
imax,
k,
t; // Temp token.
// tp; // Temp token with a period sign at the end.
// console.log( matches, balance, text, tag, balance[ 1 ] ); // eslint-disable-line no-console
// A `null` value is equivalent to no matches, i.e. an empty array.
matches = ( matches ) ? matches : [];
// Handle cases where the word is ending with period for **word category**.
// Iterate in [ m0 b1 m1 ... ] pattern as `b0` has no value here.
// *** COMMENTED out after `pushWordToken()`:
// k = 0;
// if ( tag === categories.word ) {
// for ( i = 1, imax = balance.length; i < imax; i += 1 ) {
// t = balance[ i ];
// if ( k < matches.length && t[ 0 ] === '.' ) {
// tp = matches[ k ] + '.';
// if ( isLexeme( tp ) || rgxShortForm.test( tp ) ) {
// matches[ k ] = tp;
// balance[ i ] = t.slice( 1 );
// }
// }
// k += 1;
// }
// }
// console.log( matches, balance, text, tag, balance[ 1 ] ); // eslint-disable-line no-console
// Combine tokens & matches in the following pattern [ b0 m0 b1 m1 ... ]
k = 0;
for ( i = 0, imax = balance.length; i < imax; i += 1 ) {
t = balance[ i ];
t = t.trim();
if ( t ) tokens.push( t );
if ( k < matches.length ) {
if ( tag === categories.word ) {
// Handle special cases for words via:
pushWordToken( matches[ k ], tokens );
} else {
tokens.push( [ matches[ k ], tag ] );
}
}
k += 1;
}
return ( tokens );
}; // tokenizeTextUnit()
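// To illustrate the interleaving, suppose `rgxSplit` pairs a hypothetical
// comma regex with the punctuation category; then for the text 'red, blue'
// the matches are [ ',' ] and the splits are [ 'red', ' blue' ], so the
// returned tokens are [ 'red', [ ',', tag ], 'blue' ], where the plain
// strings await later rounds of tokenization.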
// ### tokenizeTextRecursively
/**
*
* Tokenizes the input text recursively using the array of `regexes` and then
* the `tokenizeTextUnit()` function. If (or whenever) `regexes` becomes
* empty, the remaining text is added as a single **unk** token instead of
* being processed by `tokenizeTextUnit()`.
*
* @param {string} text unit that is to be tokenized.
* @param {array} regexes containing the regexes and their categories.
* @return {undefined} nothing!
* @private
*/
var tokenizeTextRecursively = function ( text, regexes ) {
var sentence = text.trim();
var tokens = [];
// Helpers – for loop variables & token category.
var i, imax;
var cat;
if ( !regexes.length ) {
// No regex left, this is the true **unk**.
// Because it is `UNK`, we can use `addToken` instead of attempting
// `addTokenIfInCache`.
addToken( text, categories.unk, ps, nonBreakingSpaces );
ps = 0;
return;
}
var rgx = regexes[ 0 ];
tokens = tokenizeTextUnit( sentence, rgx );
for ( i = 0, imax = tokens.length; i < imax; i += 1 ) {
if ( typeof tokens[ i ] === 'string' ) {
// Strings become candidates for further tokenization.
tokenizeTextRecursively( tokens[ i ], regexes.slice( 1 ) );
} else {
// Use the passed value of preceding spaces only once!
// First try cache, otherwise make a direct addition. This ensures
// processing of expansions.
cat = addTokenIfInCache( tokens[ i ][ 0 ], ps, nonBreakingSpaces );
if ( cat === categories.unk ) addToken( tokens[ i ][ 0 ], tokens[ i ][ 1 ], ps, nonBreakingSpaces );
// Reset `ps` to **0** as there can never be spaces in a text passed to
// this tokenizer.
ps = 0;
}
}
}; // tokenizeTextRecursively()
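// For example, with hypothetical `regexes` whose first entry matches commas,
// tokenizing 'Hello, world' adds ',' as a punctuation token right away,
// while 'Hello' and 'world' come back as strings and are re-tokenized
// against the remaining regexes; once `regexes` is exhausted, whatever
// text remains is added as an `unk` token.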
// ### tokenize
/**
*
* Tokenizes the input `text` using the function `tokenizeTextRecursively()`.
* This acts as the fallback to the **linear tokenizer**.
*
* @method Tokenizer#tokenize
* @param {array} rgxs containing [ regex, category ] pairs for parsing.
* @param {string} text the input sentence.
* @param {number} precedingSpaces count of spaces preceding the text.
* @param {object} doc contains the document; used here for adding tokens.
* @param {array} nbsp contains non-breaking spaces details.
* @return {void} nothing!
* @private
*/
var tokenize = function ( rgxs, text, precedingSpaces, doc, nbsp ) {
// Cache frequently used doc methods.
addToken = doc._addToken;
addTokenIfInCache = doc._addTokenIfInCache;
isLexeme = doc.isLexeme;
// Set `ps` to the passed value of preceding spaces, it will be reset to **0**
// after first use during recursion.
ps = precedingSpaces;
nonBreakingSpaces = nbsp;
tokenizeTextRecursively( text, rgxs );
}; // tokenize()
return tokenize;
};
module.exports = tokenizer;
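// A minimal usage sketch; the shapes of `categories`, `preserve`, `rgxs`
// and `doc` below are assumptions, as in practice they come from the wink
// language model and the document under construction:
//
//   const tokenizer = require( './tokenizer.js' );
//   const tokenize = tokenizer( categories, preserve );
//   // `rgxs` is an array of [ regex, category ] pairs; `doc` must expose
//   // `_addToken()`, `_addTokenIfInCache()` and `isLexeme()`.
//   tokenize( rgxs, 'Dr. Watson e-mailed me.', 1, doc, null );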