wink-porter2-stemmer
Version:
Implementation of Porter Stemmer Algorithm V2 by Dr Martin F Porter
366 lines (345 loc) • 11.9 kB
JavaScript
// wink-porter2-stemmer
// Implementation of Porter Stemmer Algorithm V2 by Dr Martin F Porter
//
// Copyright (C) 2017-19 GRAYPE Systems Private Limited
//
// This file is part of “wink-porter2-stemmer”.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
// Implements the Porter Stemmer Algorithm V2 by Dr Martin F Porter.
// Reference: https://snowballstem.org/algorithms/english/stemmer.html
// ## Regex Definitions
// Every suffix pattern below ends in `$`, so it can only match at the very end
// of the string it is applied to.
// Regex definition of `double`: one of the listed doubled consonants ending the
// word; `step1b` drops the last letter when this matches.
var rgxDouble = /(bb|dd|ff|gg|mm|nn|pp|rr|tt)$/;
// Definition for Step Ia suffixes.
// `sses` preceded by at least one character.
var rgxSFXsses = /(.+)(sses)$/;
// `ied` or `ies` preceded by two or more characters.
var rgxSFXiedORies2 = /(.{2,})(ied|ies)$/;
// `ied` or `ies` preceded by at least one character; `step1a` only consults it
// after the two-or-more pattern above has failed, i.e. for exactly one.
var rgxSFXiedORies1 = /(.{1})(ied|ies)$/;
// `us` or `ss` preceded by at least one character (left untouched by `step1a`).
var rgxSFXusORss = /(.+)(us|ss)$/;
// A final `s` preceded by at least one character.
var rgxSFXs = /(.+)(s)$/;
// Definition for Step Ib suffixes.
// `eedly` or `eed`, with the preceding word part captured in group 1.
var rgxSFXeedlyOReed = /(.*)(eedly|eed)$/;
// `ed`, `edly`, `ingly` or `ing`; the leading `[aeiouy]` requires a vowel
// somewhere before the suffix. Group 1 runs from that vowel up to the suffix.
var rgxSFXedORedlyORinglyORing = /([aeiouy].*)(ed|edly|ingly|ing)$/;
// Endings that get an `e` appended in `step1b` after a suffix was removed.
var rgxSFXatORblORiz = /(at|bl|iz)$/;
// Definition for Step Ic suffixes.
// A final `y` or `3` preceded by a non-vowel that is itself preceded by at
// least one character. `3` is the stand-in that `prelude` writes for some `y`s.
var rgxSFXyOR3 = /(.+[^aeiouy])([y3])$/;
// Definition for Step II suffixes; note we have to spot the longest suffix.
// `step2` uses the suffix captured by this pattern only to check that it lies
// inside region R1; the actual rewrite comes from the table below.
var rgxSFXstep2 = /(ization|ational|fulness|ousness|iveness|tional|biliti|lessli|entli|ation|alism|aliti|ousli|iviti|fulli|enci|anci|abli|izer|ator|alli|bli|ogi|li)$/;
// Rewrite rules for Step II. `step2` tries them in array order and returns the
// first one that changes the word, which is why the entries are grouped from
// the longest suffix to the shortest.
var rgxSFXstep2WithReplacements = [
// Length 7.
{ rgx: /ational$/, replacement: 'ate' },
{ rgx: /ization$/, replacement: 'ize' },
{ rgx: /fulness$/, replacement: 'ful' },
{ rgx: /ousness$/, replacement: 'ous' },
{ rgx: /iveness$/, replacement: 'ive' },
// Length 6.
{ rgx: /tional$/, replacement: 'tion' },
{ rgx: /biliti$/, replacement: 'ble' },
{ rgx: /lessli$/, replacement: 'less' },
// Length 5.
{ rgx: /iviti$/, replacement: 'ive' },
{ rgx: /ousli$/, replacement: 'ous' },
{ rgx: /ation$/, replacement: 'ate' },
{ rgx: /entli$/, replacement: 'ent' },
{ rgx: /(.*)(alism|aliti)$/, replacement: '$1al' },
{ rgx: /fulli$/, replacement: 'ful' },
// Length 4.
{ rgx: /alli$/, replacement: 'al' },
{ rgx: /ator$/, replacement: 'ate' },
{ rgx: /izer$/, replacement: 'ize' },
{ rgx: /enci$/, replacement: 'ence' },
{ rgx: /anci$/, replacement: 'ance' },
{ rgx: /abli$/, replacement: 'able' },
// Length 3.
{ rgx: /bli$/, replacement: 'ble' },
// `ogi` becomes `og` only when preceded by `l`.
{ rgx: /(.*)(l)(ogi)$/, replacement: '$1$2og' },
// Length 2.
// `li` is dropped only when preceded by one of the letters `cdeghkmnrt`.
{ rgx: /(.*)([cdeghkmnrt])(li)$/, replacement: '$1$2' }
];
// Definition for Step III suffixes; once again spot the longest one first!
// `step3` uses the captured suffix only to check that it lies inside region R1.
var rgxSFXstep3 = /(ational|tional|alize|icate|iciti|ative|ical|ness|ful)$/;
// Rewrite rules for Step III, tried in array order; the first rule that changes
// the word wins. `ative` has no entry here: `step3` removes it separately, and
// only when it is found in region R2.
var rgxSFXstep3WithReplacements = [
{ rgx: /ational$/, replacement: 'ate' },
{ rgx: /tional$/, replacement: 'tion' },
{ rgx: /alize$/, replacement: 'al' },
{ rgx: /(.*)(icate|iciti|ical)$/, replacement: '$1ic' },
{ rgx: /(ness|ful)$/, replacement: '' },
];
// Definition for Step IV suffixes.
// Suffixes that `step4` deletes outright.
var rgxSFXstep4 = /(ement|ance|ence|able|ible|ment|ant|ent|ism|ate|iti|ous|ive|ize|al|er|ic)$/;
// The same list plus `ion`; `step4` uses it to find the suffix and to check
// that the suffix lies inside region R2.
var rgxSFXstep4Full = /(ement|ance|ence|able|ible|ment|ant|ent|ism|ate|iti|ous|ive|ize|ion|al|er|ic)$/;
// `ion` preceded by `s` or `t`; the replacement in `step4` keeps that letter.
var rgxSFXstep4ion = /(.*)(s|t)(ion)$/;
// Exceptions Set I.
// Whole words whose stem is looked up directly instead of being computed. The
// map has no prototype, so a lookup can only ever hit the keys listed here.
var exceptions1 = Object.create( null );
[
  // Mapped!
  [ 'skis', 'ski' ],
  [ 'skies', 'sky' ],
  [ 'dying', 'die' ],
  [ 'lying', 'lie' ],
  [ 'tying', 'tie' ],
  [ 'idly', 'idl' ],
  [ 'gently', 'gentl' ],
  [ 'ugly', 'ugli' ],
  [ 'early', 'earli' ],
  [ 'only', 'onli' ],
  [ 'singly', 'singl' ],
  // Invariants!
  [ 'sky', 'sky' ],
  [ 'news', 'news' ],
  [ 'atlas', 'atlas' ],
  [ 'cosmos', 'cosmos' ],
  [ 'bias', 'bias' ],
  [ 'andes', 'andes' ]
].forEach( function ( pair ) {
  exceptions1[ pair[ 0 ] ] = pair[ 1 ];
} );
// Exceptions Set II.
// Note, these are to be treated as full words: the pattern is anchored at both
// ends, so it never matches a longer word that merely contains one of them.
var rgxException2 = /^(inning|outing|canning|herring|proceed|exceed|succeed|earring)$/;
// ## Private functions
// ### prelude
/**
 * Performs the initial pre-processing of the input string `s`:
 *
 * 1. a `y` at the start of the word and every `y` that directly follows one of
 *    `a e i o u` is rewritten to the marker character `3`, so that the later
 *    steps do not treat it as a vowel;
 * 2. a trailing possessive – `'s'`, `'s` or a lone `'` (straight or curly
 *    apostrophe) – is removed.
 *
 * @param {String} s Input string
 * @return {String} Processed string
 * @private
 */
var prelude = function ( s ) {
  return ( s
    // Handle `y`'s. The second pattern is global: a word can hold more than
    // one vowel + `y` pair (e.g. `playboy`) and each of them must be marked.
    .replace( /^y/, '3' )
    .replace( /([aeiou])y/g, function ( pair, vowel ) {
      return ( vowel + '3' );
    } )
    // Handle apostrophe. Alternatives are listed longest first; only the
    // apostrophe suffix is removed, never an `s` that belongs to the word
    // itself (`boss'` must stay `boss`).
    .replace( /[’']s[’']$|[’']s$|[’']$/, '' )
  );
}; // prelude()
// ### isShort
/**
 * Tells whether `s` ends in a short syllable.
 *
 * Two shapes qualify:
 * (a) `s` ends with non-vowel + vowel + non-vowel, where the last character is
 *     none of `w`, `x` or the marker `3`; or
 * (b) `s` as a whole is a single vowel, optionally followed by one non-vowel.
 *
 * @param {String} s Input string
 * @return {Boolean} `true` if `s` is a short syllable, `false` otherwise
 * @private
 */
var isShort = function ( s ) {
  // Shape (a): checked against the end of the string only.
  var closedSyllableAtEnd = /[^aeiouy][aeiouy][^aeiouywx3]$/;
  // Shape (b): must cover the whole string; `{0,1}` makes the trailing
  // non-vowel optional, so a lone vowel also counts.
  var vowelLedWholeWord = /^[aeiouy][^aeiouy]{0,1}$/;

  if ( closedSyllableAtEnd.test( s ) ) return true;
  return vowelLedWholeWord.test( s );
}; // isShort()
// ### markRegions
/**
 * Computes the regions `R1` and `R2` of `s`.
 *
 * A region is whatever follows the first non-vowel that comes after a vowel,
 * provided something does follow it; otherwise the region is the empty string.
 * `R1` is taken from `s`, `R2` is taken from `R1` in the same way. For words
 * starting with `gener`, `commun` or `arsen`, `R1` is instead the remainder
 * after that prefix (only if a regular `R1` was found at all).
 *
 * @param {String} s Input string
 * @return {Object} the `R1` and `R2` regions as an object from the input string `s`.
 * @private
 */
var markRegions = function ( s ) {
  // One or more vowels, one non-vowel, then at least one more character.
  var vowelsThenTail = /[aeiouy]+([^aeiouy]{1}.+)/;
  // Returns the text after the first "vowel run + non-vowel" in `text`, or
  // `null` when there is no such non-empty remainder.
  var regionOf = function ( text ) {
    var hit = text.match( vowelsThenTail );
    return ( hit === null ) ? null : hit[ 1 ].slice( 1 );
  };
  var first, second;

  first = regionOf( s );
  if ( first === null ) return ( { r1: '', r2: '' } );
  // Handle exceptions here to prevent over stemming.
  if ( ( /^(gener|commun|arsen)/ ).test( s ) ) {
    first = s.replace( /^(gener|commun|arsen)(.*)/, '$2' );
  }

  second = regionOf( first );
  return ( { r1: first, r2: ( second === null ) ? '' : second } );
}; // markRegions()
// ### step1a
/**
 * Step Ia: handles the plural-like endings.
 *
 * - `sses` becomes `ss`;
 * - `ied`/`ies` becomes `i` after two or more characters, `ie` after one;
 * - words ending in `us` or `ss` are left alone;
 * - otherwise a final `s` is dropped when the rest of the word contains a
 *   vowel that is followed by at least one more character.
 *
 * @param {String} s Input string
 * @return {String} Processed string
 * @private
 */
var step1a = function ( s ) {
  // Suffix rewrites, in priority order: [ pattern, replacement ].
  var rewrites = [
    [ rgxSFXsses, '$1ss' ],
    [ rgxSFXiedORies2, '$1i' ],
    [ rgxSFXiedORies1, '$1ie' ]
  ];
  var k, withoutS;

  for ( k = 0; k < rewrites.length; k += 1 ) {
    if ( rewrites[ k ][ 0 ].test( s ) ) {
      return ( s.replace( rewrites[ k ][ 0 ], rewrites[ k ][ 1 ] ) );
    }
  }
  if ( rgxSFXusORss.test( s ) ) return ( s );

  // `withoutS` equals `s` itself when there is no final `s` to remove.
  withoutS = s.replace( rgxSFXs, '$1' );
  return ( ( /[aeiuouy](.+)$/ ).test( withoutS ) ? withoutS : s );
}; // step1a()
// ### step1b
/**
 * Step Ib: handles `eed`/`eedly` and `ed`/`edly`/`ing`/`ingly`.
 *
 * - `eed`/`eedly` becomes `ee` when the suffix lies in region R1; otherwise the
 *   word is returned unchanged.
 * - `ed`/`edly`/`ing`/`ingly` is removed when a vowel precedes it. What is left
 *   is then adjusted: an `e` is appended after `at`/`bl`/`iz`; a doubled
 *   consonant loses its last letter; an `e` is appended when the remainder is
 *   short and has an empty R1.
 *
 * @param {String} s Input string
 * @return {String} Processed string
 * @private
 */
var step1b = function ( s ) {
  var remainder;

  // Search for the longest among the `eedly|eed` suffixes.
  if ( rgxSFXeedlyOReed.test( s ) ) {
    // R1 is a tail of `s`, so the suffix is in R1 exactly when R1 ends with it.
    if ( !rgxSFXeedlyOReed.test( markRegions( s ).r1 ) ) return ( s );
    return ( s.replace( rgxSFXeedlyOReed, '$1ee' ) );
  }

  // Nothing to delete unless a vowel precedes `ed|edly|ingly|ing`.
  if ( !rgxSFXedORedlyORinglyORing.test( s ) ) return ( s );

  remainder = s.replace( rgxSFXedORedlyORinglyORing, '$1' );
  // Post-deletion fix-ups; the first one that applies decides the result.
  if ( rgxSFXatORblORiz.test( remainder ) ) return ( remainder + 'e' );
  if ( rgxDouble.test( remainder ) ) return ( remainder.replace( /.$/, '' ) );
  if ( isShort( remainder ) && markRegions( remainder ).r1 === '' ) {
    return ( remainder + 'e' );
  }
  return ( remainder );
}; // step1b()
// ### step1c
/**
 * Step Ic: turns a final `y` (or the marker `3`) into `i` when it follows a
 * non-vowel that is not the first character of the word.
 *
 * @param {String} s Input string
 * @return {String} Processed string
 * @private
 */
var step1c = function ( s ) {
  var hit = rgxSFXyOR3.exec( s );
  if ( hit === null ) return ( s );
  // Keep everything up to and including the captured word part, then add `i`
  // in place of the final `y`/`3`.
  return ( s.slice( 0, hit.index ) + hit[ 1 ] + 'i' );
}; // step1c()
// ### step2
/**
 * Step II: finds the Step II suffix that ends `s` and, when that suffix lies
 * in region R1, applies the first rule of `rgxSFXstep2WithReplacements` that
 * changes the word. The word is returned unchanged when no suffix is found,
 * the suffix is outside R1, or no rule has any effect.
 *
 * @param {String} s Input string
 * @return {String} Processed string
 * @private
 */
var step2 = function ( s ) {
  var hit = rgxSFXstep2.exec( s );
  // Sentinel used when no suffix is present.
  var suffix = ( hit === null ) ? '$$$$$' : hit[ 1 ];
  var k, rule, rewritten;

  // R1 is a tail of `s`, so it contains the suffix only if it ends with it.
  if ( markRegions( s ).r1.indexOf( suffix ) === -1 ) return ( s );

  for ( k = 0; k < rgxSFXstep2WithReplacements.length; k += 1 ) {
    rule = rgxSFXstep2WithReplacements[ k ];
    rewritten = s.replace( rule.rgx, rule.replacement );
    if ( rewritten !== s ) return ( rewritten );
  }
  return ( s );
}; // step2()
// ### step3
/**
 * Step III: finds the Step III suffix that ends `s` and, when that suffix lies
 * in region R1, applies the first rule of `rgxSFXstep3WithReplacements` that
 * changes the word. If no rule changes it, `ative` is removed provided it is
 * found in region R2. In every other case the word is returned unchanged.
 *
 * @param {String} s Input string
 * @return {String} Processed string
 * @private
 */
var step3 = function ( s ) {
  var regions = markRegions( s );
  var hit = rgxSFXstep3.exec( s );
  // Sentinel used when no suffix is present.
  var suffix = ( hit === null ) ? '$$$$$' : hit[ 1 ];
  var k, rule, rewritten;

  if ( regions.r1.indexOf( suffix ) === -1 ) return ( s );

  for ( k = 0; k < rgxSFXstep3WithReplacements.length; k += 1 ) {
    rule = rgxSFXstep3WithReplacements[ k ];
    rewritten = s.replace( rule.rgx, rule.replacement );
    if ( rewritten !== s ) return ( rewritten );
  }
  // Only `ative` has no table entry; it needs the stricter R2 condition.
  return ( ( /ative/ ).test( regions.r2 ) ? s.replace( /ative$/, '' ) : s );
}; // step3()
// ### step4
/**
 * Step IV: finds the Step IV suffix that ends `s` and deletes it when it lies
 * in region R2. `ion` is only deleted when preceded by `s` or `t`, and that
 * letter is kept. In every other case the word is returned unchanged.
 *
 * @param {String} s Input string
 * @return {String} Processed string
 * @private
 */
var step4 = function ( s ) {
  var hit = rgxSFXstep4Full.exec( s );

  // No Step IV suffix at all.
  if ( hit === null ) return ( s );
  // R2 is a tail of `s`, so it contains the suffix only if it ends with it.
  if ( markRegions( s ).r2.indexOf( hit[ 1 ] ) === -1 ) return ( s );

  // Plain suffixes are simply removed.
  if ( rgxSFXstep4.test( s ) ) return ( s.replace( rgxSFXstep4, '' ) );
  // What remains is `ion`, which needs `s` or `t` in front of it.
  if ( rgxSFXstep4ion.test( s ) ) return ( s.replace( rgxSFXstep4ion, '$1$2' ) );
  return ( s );
}; // step4()
// ### step5
/**
 * Step V: handles a final `e` or a final `l`.
 *
 * - A final `e` is removed when it lies in region R2, or when it lies in R1
 *   and the word part before it does not end in a short syllable.
 * - A final `l` that lies in R2 is removed when the word ends in `ll`.
 *
 * @param {String} s Input string
 * @return {String} Processed string
 * @private
 */
var step5 = function ( s ) {
  // Both branches look at the regions of the same, unmodified `s`.
  var regions = markRegions( s );
  var withoutE;

  // Search for the `e` suffixes.
  if ( ( /e$/i ).test( s ) ) {
    withoutE = s.replace( /e$/, '' );
    // A non-empty region is a tail of `s`, so if it contains an `e` at all
    // the final `e` is part of it.
    if ( ( /e/ ).test( regions.r2 ) ) return ( withoutE );
    if ( ( /e/ ).test( regions.r1 ) && !isShort( withoutE ) ) return ( withoutE );
    return ( s );
  }

  // Search for the `l` suffixes: delete one `l` of a final `ll` if in R2.
  if ( ( /l$/ ).test( s ) && regions.r2 && ( /l$/ ).test( regions.r2 ) ) {
    return ( s.replace( ( /ll$/ ), 'l' ) );
  }

  // If nothing happens, must return the string!
  return ( s );
}; // step5()
// ## Public functions
// ### stem
/**
 *
 * Stems an inflected `word` using Porter2 stemming algorithm.
 *
 * The word is lower-cased first. Words shorter than three characters are
 * returned as they are, and words listed in `exceptions1` are answered from
 * that table. Everything else goes through `prelude` and `step1a`; unless the
 * result is one of the `rgxException2` words, steps 1b to 5 follow.
 *
 * A digit `3` that is part of the input is preserved: the steps use `3`
 * internally as the marker for a consonant `y`, so literal `3`s are shielded
 * while the steps run and restored afterwards.
 *
 * @param {string} word — word to be stemmed.
 * @return {string} — the stemmed word.
 *
 * @example
 * stem( 'consisting' );
 * // -> consist
 * stem( 'mp3' );
 * // -> mp3
 */
var stem = function ( word ) {
  var str = word.toLowerCase();
  if ( str.length < 3 ) return ( str );
  if ( exceptions1[ str ] ) return ( exceptions1[ str ] );
  // Shield literal `3`s from the marker handling. `str` is lower case, so an
  // upper-case letter cannot clash with real input, and no step pattern
  // matches `T`; it is treated as an ordinary non-vowel.
  str = str.replace( /3/g, 'T' );
  str = prelude( str );
  str = step1a( str );
  if ( !rgxException2.test( str ) ) {
    str = step1b( str );
    str = step1c( str );
    str = step2( str );
    str = step3( str );
    str = step4( str );
    str = step5( str );
  }
  // Turn the remaining consonant-`y` markers back into `y`, then put the
  // shielded digits back.
  str = str.replace( /3/g, 'y' ).replace( /T/g, '3' );
  return ( str );
}; // stem()
// Export stem function: `stem` itself is the module's export (CommonJS
// `module.exports`).
module.exports = stem;