node-wordnet
Version:
Node.js interface for Wordnet
526 lines (413 loc) • 16.1 kB
text/coffeescript
## Copyright (c) 2011, Chris Umbel
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to deal
## in the Software without restriction, including without limitation the rights
## to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
## copies of the Software, and to permit persons to whom the Software is
## furnished to do so, subject to the following conditions:
##
## The above copyright notice and this permission notice shall be included in
## all copies or substantial portions of the Software.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
## THE SOFTWARE.
##
## Significant changes made by Stuart Watt, including:
## (1) - implementation of logic for morphological exceptions
## (2) - using sense offsets as per Perl implementations
## (3) - porting to CoffeeScript for easier validation and better array performance
## (4) - promisification of much of the API
## (5) - move to use wndb-with-exceptions instead of WNdb, to provide morphological exceptions
## (6) - significant improvements in testing
IndexFile = require './index_file'
DataFile = require './data_file'
async = require 'async'
Promise = require 'bluebird'
path = require 'path'
fs = require 'fs'
LRU = require 'lru-cache'
require('es6-shim')
class WordNet
constructor: (options) ->
  ## Build a WordNet reader.
  ## `options` may be a dictionary-directory string (legacy form) or an object
  ## with optional `dataDir` and `cache` members. `cache` may be `true` (use an
  ## LRU with max 2000), an LRU options object, or any object exposing `get`.
  options = {dataDir: options} if typeof options == 'string'
  options ?= {}

  ## Fall back to the bundled database package when no directory is given
  unless options.dataDir?
    try
      WNdb = require('wndb-with-exceptions')
    catch e
      console.error("Please 'npm install wndb-with-exceptions' before using WordNet module or specify a dict directory.")
      throw e
    options.dataDir = WNdb.path

  ## Normalise the cache setting, then pick or create the cache instance
  options.cache = {max: 2000} if options.cache == true
  @cache =
    if not options.cache
      null
    else if typeof options.cache == 'object' and typeof options.cache.get == 'function'
      options.cache
    else
      LRU options.cache

  ## Index files are created before data files, in noun/verb/adj/adv order
  dir = @path = options.dataDir
  kinds = ['noun', 'verb', 'adj', 'adv']
  [@nounIndex, @verbIndex, @adjIndex, @advIndex] = (new IndexFile(dir, kind) for kind in kinds)
  [@nounData, @verbData, @adjData, @advData] = (new DataFile(dir, kind) for kind in kinds)

  @allFiles = [
    {index: @nounIndex, data: @nounData, pos: 'n'}
    {index: @verbIndex, data: @verbData, pos: 'v'}
    {index: @adjIndex, data: @adjData, pos: 'a'}
    {index: @advIndex, data: @advData, pos: 'r'}
  ]
get: (synsetOffset, pos, callback) ->
  ## Fetch the synset record at `synsetOffset` in the data file for `pos`.
  ## Callbacks declared with one parameter receive only the record (legacy
  ## style); all others receive (err, record).
  wordnet = @

  deliver = (err, value) ->
    if callback.length == 1
      callback.call wordnet, value
    else
      callback.call wordnet, err, value

  ## The key stays undefined when caching is disabled
  cacheKey = if @cache then "get:#{synsetOffset}:#{pos}"
  if cacheKey
    cached = wordnet.cache.get cacheKey
    return deliver null, cached if cached

  wordnet.getDataFile(pos).get synsetOffset, (err, result) ->
    ## Only successful reads are cached
    wordnet.cache.set cacheKey, result if cacheKey and not err?
    deliver err, result
getAsync: (synsetOffset, pos) ->
  ## Promise-returning wrapper around `get`
  new Promise (resolve, reject) =>
    @get synsetOffset, pos, (err, data) ->
      return reject err if err?
      resolve data
lookup: (input, callback) ->
  ## Look up every synset for `input`, given as "word" or "word#pos".
  ## Without a POS all four files are searched. One-parameter callbacks
  ## receive only the results (or the error, on failure).
  wordnet = @
  [word, pos] = input.split('#')
  lword = word.toLowerCase().replace(/\s+/g, '_')

  deliver = (value) ->
    if callback.length == 1
      callback.call wordnet, value
    else
      callback.call wordnet, null, value

  cacheKey = if @cache then "lookup:#{input}"
  if cacheKey
    cached = wordnet.cache.get cacheKey
    return deliver cached if cached

  ## lookupFromFiles consumes the list it is given, so always hand it a fresh array
  selectedFiles =
    if pos
      wordnet.allFiles.filter (file) -> file.pos == pos
    else
      wordnet.allFiles.slice()

  wordnet.lookupFromFiles selectedFiles, [], lword, (err, results) ->
    return callback.call wordnet, err if err?
    wordnet.cache.set cacheKey, results if cacheKey
    deliver results
lookupAsync: (input, callback) ->
  ## Promise-returning wrapper around `lookup`; the `callback` parameter is not used
  new Promise (resolve, reject) =>
    @lookup input, (err, data) ->
      return reject err if err?
      resolve data
findSense: (input, callback) ->
  ## Resolve a "word#pos#senseNumber" query to the record at that 1-based
  ## position in the lookup results for the given POS.
  ## Throws synchronously when the sense number is missing, not numeric, or < 1.
  wordnet = @
  [word, pos, senseNumber] = input.split('#')

  deliver = (value) ->
    if callback.length == 1
      callback.call wordnet, value
    else
      callback.call wordnet, null, value

  ## Cache hits are answered before the sense number is validated
  cacheKey = if @cache then "findSense:#{input}"
  if cacheKey
    cached = wordnet.cache.get cacheKey
    return deliver cached if cached

  sense = parseInt(senseNumber)
  throw new Error("Sense number should be an integer") if Number.isNaN(sense)
  throw new Error("Sense number should be a positive integer") if sense < 1

  lword = word.toLowerCase().replace(/\s+/g, '_')
  selectedFiles = wordnet.allFiles.filter (file) -> file.pos == pos
  wordnet.lookupFromFiles selectedFiles, [], lword, (err, response) ->
    return callback.call wordnet, err if err?
    result = response[sense - 1]
    wordnet.cache.set cacheKey, result if cacheKey
    deliver result
findSenseAsync: (input) ->
  ## Promise-returning wrapper around `findSense`; a synchronous throw from
  ## `findSense` happens inside the executor and so surfaces as a rejection
  new Promise (resolve, reject) =>
    @findSense input, (err, data) ->
      return reject err if err?
      resolve data
querySense: (input, callback) ->
  ## List the sense identifiers ("word#pos#n") for every lookup result of
  ## `input`. Numbering restarts at 1 for each POS, and satellite adjectives
  ## ('s') are counted together with adjectives ('a').
  wordnet = @
  [word, pos] = input.split('#')

  deliver = (value) ->
    if callback.length == 1
      callback.call wordnet, value
    else
      callback.call wordnet, null, value

  cacheKey = if @cache then "querySense:#{input}"
  if cacheKey
    cached = wordnet.cache.get cacheKey
    return deliver cached if cached

  wordnet.lookup input, (err, results) ->
    return callback.call wordnet, err if err?
    counters = {}
    senses = for entry in results
      tag = if entry.pos == 's' then 'a' else entry.pos
      number = counters[tag] ? 1
      counters[tag] = number + 1
      word + "#" + tag + "#" + number
    wordnet.cache.set cacheKey, senses if cacheKey
    deliver senses
querySenseAsync: (input) ->
  ## Promise-returning wrapper around `querySense`
  new Promise (resolve, reject) =>
    @querySense input, (err, data) ->
      return reject err if err?
      resolve data
lookupFromFiles: (files, results, word, callback) ->
  ## Search `word` in each {index, data} entry of `files`, appending the
  ## matching records to `results`. `files` is consumed from the end, one
  ## entry per call; when it is empty, callback(null, results) is invoked.
  wordnet = @
  return callback.call wordnet, null, results if files.length == 0

  file = files.pop()
  proceed = -> wordnet.lookupFromFiles files, results, word, callback

  ## NOTE(review): `err` from the index lookup is ignored here; a failed
  ## lookup is treated exactly like a miss. Confirm that is intended.
  file.index.lookup word, (err, record) ->
    return proceed() unless record
    wordnet.pushResults file.data, results, record.synsetOffset, proceed
pushResults: (data, results, offsets, callback) ->
  ## Read the record for each offset from `data` and append it to `results`.
  ## `offsets` is consumed from the end; callback(results) fires once it is empty.
  wordnet = @
  return callback(results) if offsets.length == 0

  nextOffset = offsets.pop()
  ## NOTE(review): `err` is ignored, so a failed read appends whatever
  ## `record` value the data file supplied. Confirm that is intended.
  data.get nextOffset, (err, record) ->
    results.push record
    wordnet.pushResults data, results, offsets, callback
loadResultSynonyms: (synonyms, results, callback) ->
  ## Take the last record off `results` and resolve its pointers through
  ## `loadSynonyms`; once `results` is empty, hand `synonyms` to the callback.
  wordnet = this
  return callback(synonyms) unless results.length > 0

  current = results.pop()
  wordnet.loadSynonyms synonyms, results, current.ptrs, callback
loadSynonyms: (synonyms, results, ptrs, callback) ->
  ## Resolve every pointer in `ptrs` (last to first) with `get`, appending each
  ## fetched record to `synonyms`, then continue with the remaining `results`
  ## through `loadResultSynonyms`.
  ##
  ## `ptrs` is normally the `ptrs` array of a record returned by `get`/`lookup`,
  ## and those records may be held by the cache. It is therefore copied rather
  ## than popped in place; otherwise a second call would find it already empty.
  wordnet = this
  pending = ptrs.slice()

  next = ->
    if pending.length == 0
      return wordnet.loadResultSynonyms synonyms, results, callback
    ptr = pending.pop()
    ## One-parameter callback: `get` then passes only the record
    wordnet.get ptr.synsetOffset, ptr.pos, (result) ->
      synonyms.push(result)
      next()

  next()
lookupSynonyms: (word, callback) ->
  ## Look up `word` and collect the records referenced by the pointers of
  ## every result; callback(synonyms) receives the collected array.
  wordnet = this
  ## One-parameter callback: `lookup` then passes only the results
  wordnet.lookup word, (results) ->
    ## loadResultSynonyms pops its input until it is empty. The array from
    ## `lookup` may be the very object stored in the cache, so pass a copy
    ## to keep later lookups of the same word intact. Non-array values
    ## (an error passed through the legacy callback) go through unchanged.
    queue = if Array.isArray(results) then results.slice() else results
    wordnet.loadResultSynonyms [], queue, callback
getSynonyms: () ->
  ## Collect the records referenced by a synset's pointers.
  ## Accepts either (record, callback), where record carries `synsetOffset`
  ## and `pos`, or (synsetOffset, pos, callback).
  wordnet = this
  [first, second, third] = arguments
  callback = third or second
  pos = first.pos or second
  synsetOffset = first.synsetOffset or first
  ## One-parameter callback: `get` then passes only the record
  @get synsetOffset, pos, (result) ->
    wordnet.loadSynonyms [], [], result.ptrs, callback
getDataFile: (pos) ->
  ## Map a POS tag to its data file; 's' shares the adjective file.
  ## Any other tag yields undefined.
  return @nounData if pos == 'n'
  return @verbData if pos == 'v'
  return @adjData if pos in ['a', 's']
  return @advData if pos == 'r'
  undefined
## Exceptions aren't part of the node.js source, but they are needed to map some of
## the exceptions in derivations. Really, these should be loaded in the constructor, but
## sadly this code is asynchronous and we really don't want to force everything to
## block here. That's why a move to promises would be helpful, because all the dependent
## code is also going to be asynchronous and we can chain when we need to. For now, though,
## we'll handle it with callbacks when needed.
## Exception list files read from the dictionary directory, with the POS tag
## under which each one is stored
exceptions = [
  {name: "noun.exc", pos: 'n'},
  {name: "verb.exc", pos: 'v'},
  {name: "adj.exc", pos: 'a'},
  {name: "adv.exc", pos: 'r'},
]

## Read all exception files and publish them on the prototype as
## WordNet::exceptions, a map of POS tag -> {inflected form -> [base forms]}.
## callback(err) is invoked once loading has finished; `err` is the first
## read failure, if any. Whatever results are available are still published,
## so the 'pending' marker is always replaced.
_loadExceptions = (wordnet, callback) ->
  ## Flag while loading, so anyone who tries to use it can check and wait until the load
  ## is complete, instead of multiple loads happening at once.
  WordNet::exceptions = 'pending'

  loadFile = (exception, done) ->
    fullPath = path.join wordnet.path, exception.name
    fs.readFile fullPath, (err, data) ->
      return done(err) if err
      ## No prototype, so a term such as "constructor" cannot collide with an
      ## inherited property
      temp = Object.create(null)
      for line in data.toString().split("\n") when line.length > 0
        ## First token is the inflected form, the rest are its base forms
        [term1, term2...] = line.split(' ')
        temp[term1] ?= []
        Array.prototype.push.apply temp[term1], term2
      done null, {pos: exception.pos, data: temp}

  async.map exceptions, loadFile, (err, results) ->
    ## Use a fresh local name: assigning to `exceptions` here would overwrite
    ## the file list above, because CoffeeScript reuses the outer variable.
    loaded = {}
    ## On failure `results` may be missing or contain holes; skip those
    for result in (results ? []) when result?
      loaded[result.pos] = result.data
    WordNet::exceptions = loaded
    callback(err)
close: () ->
  ## Close all index files, then all data files, in noun/verb/adj/adv order.
  ## Returns whatever the last close() call returned.
  handles = [
    @nounIndex, @verbIndex, @adjIndex, @advIndex
    @nounData, @verbData, @adjData, @advData
  ]
  outcomes = (handle.close() for handle in handles)
  outcomes[outcomes.length - 1]
## Implementation of validForms. This isn't part of the original node.js Wordnet,
## and has instead been adapted from WordNet::QueryData. This helps to map words
## to WordNet by allowing different forms to be considered. Obviously, it's highly
## specific to English.
## Return the items of `a` with duplicates removed, keeping first occurrences
## in their original order. Items are compared by their string form.
unique = (a) ->
  ## A prototype-less map: with a plain {} the inherited "constructor"
  ## property is truthy, so such words would be dropped as already seen.
  found = Object.create(null)
  a.filter (item) ->
    return false if found[item]
    found[item] = true
## Suffix detachment rules per POS tag, as [suffix, replacement] pairs applied
## in order. The comparative/superlative rules are the adjective rules of
## WordNet's morphy; they are kept for 'r' as well because earlier versions
## of this module applied them to adverbs only.
detachRules =
  n: [['s', ''], ['ses', 's'], ['xes', 'x'], ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'], ['men', 'man'], ['ies', 'y']]
  v: [['s', ''], ['ies', 'y'], ['es', ''], ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']]
  a: [['er', ''], ['er', 'e'], ['est', ''], ['est', 'e']]
  r: [['er', ''], ['er', 'e'], ['est', ''], ['est', 'e']]

## Given "word#pos", return the word followed by every candidate base form
## obtained by detaching a suffix valid for that POS, without duplicates.
## Unknown or missing POS tags yield just the word itself.
tokenDetach = (string) ->
  [word, pos] = string.split('#')
  detach = [word]
  rules = if Object::hasOwnProperty.call(detachRules, pos) then detachRules[pos] else []
  for [suffix, replacement] in rules
    ## Require a non-empty stem, so a word that is nothing but a suffix
    ## ("ing", "es") does not produce empty or bogus candidates
    if word.length > suffix.length and word.endsWith(suffix)
      detach.push word.substring(0, word.length - suffix.length) + replacement
  unique(detach)
## Return the candidate base forms of `word` for `pos`, the word itself first.
## A word listed in wordnet.exceptions[pos] yields the word plus its listed
## base forms. A single token is handled by tokenDetach. A multi-token word
## (split on spaces or underscores) yields every combination of its tokens'
## forms, joined with underscores.
_forms = (wordnet, word, pos) ->
  lword = word.toLowerCase()

  ## First check to see if we have an exception set. Only own properties
  ## count: a plain-object map would otherwise answer "constructor" with an
  ## inherited function.
  known = wordnet.exceptions[pos]
  if known? and Object::hasOwnProperty.call(known, lword)
    exception = known[lword]
    return [word].concat(exception) if exception

  tokens = word.split(/[ _]/g)

  ## If a single term, process using tokenDetach
  return tokenDetach(tokens[0] + "#" + pos) if tokens.length == 1

  ## Otherwise, handle the forms recursively
  alternatives = tokens.map (token) -> _forms(wordnet, token, pos)

  ## Now generate all possible token sequences (collocations). `index` works
  ## like an odometer over the alternatives, first token changing fastest.
  rtn = []
  index = (0 for token in tokens)
  loop
    colloc = alternatives[0][index[0]]
    for i in [1...tokens.length]
      colloc = colloc + '_' + alternatives[i][index[i]]
    rtn.push colloc

    ## Advance the odometer; stop once every position has wrapped around
    i = 0
    while i < tokens.length
      index[i] = index[i] + 1
      break if index[i] < alternatives[i].length
      index[i] = 0
      i = i + 1
    break if i >= tokens.length

  rtn
## Expand "word#pos" into its candidate forms, each tagged again with "#pos"
forms = (wordnet, string) ->
  [word, pos] = string.split('#')
  _forms(wordnet, word, pos).map (element) -> element + "#" + pos
## Find the candidate forms of `string` ("word" or "word#pos") for which a
## lookup returns at least one record. Calls callback(err, forms), each form
## being "word#pos". Without a POS, all four are tried in n, v, a, r order
## and their results concatenated.
_validForms = (wordnet, string, callback) ->
  [word, pos] = string.split('#')

  if ! pos
    ## No POS, so use a reduce to try them all and concatenate. The query is
    ## rebuilt from `word`, not `string`: an input with an empty POS ("dog#")
    ## would otherwise become "dog##n", parse as POS-less again and never end.
    reducer = (previous, current, next) ->
      ## NOTE(review): a failure for one POS is not propagated; whatever
      ## forms were found are still merged (best effort).
      _validForms wordnet, word + "#" + current, (err, value) ->
        if value == undefined
          next(null, previous)
        else
          next(null, previous.concat(value))

    async.reduce ['n', 'v', 'a', 'r'], [], reducer, (err, result) ->
      callback null, result

  else
    possibleForms = forms(wordnet, word + "#" + pos)
    filteredResults = []

    ## Keep a form only if the dictionary has at least one record for it
    eachFn = (term, done) ->
      wordnet.lookup term, (err, data) ->
        if err?
          return done(err)
        filteredResults.push term if data.length > 0
        done()

    async.each possibleForms, eachFn, (err) ->
      callback err, filteredResults
## Run _validForms once the exception lists are available: start loading
## them if nothing has been loaded yet, retry on the next tick while a load
## is marked 'pending', and proceed directly otherwise.
_validFormsWithExceptions = (wordnet, string, callback) ->
  switch wordnet.exceptions
    when undefined
      _loadExceptions wordnet, ->
        _validFormsWithExceptions(wordnet, string, callback)
    when 'pending'
      setImmediate _validFormsWithExceptions, wordnet, string, callback
    else
      _validForms(wordnet, string, callback)
validForms: (string, callback) ->
  ## Find the dictionary forms of `string` ("word" or "word#pos").
  ## Callbacks declared with one parameter receive only the forms (legacy
  ## style); all others receive (err, forms). As in `lookup`, a failure is
  ## passed as the callback's first argument and nothing is cached.
  wordnet = @

  if @cache
    query = "validForms:#{string}"
    if hit = wordnet.cache.get query
      if callback.length == 1
        return callback.call wordnet, hit
      else
        return callback.call wordnet, null, hit

  _validFormsWithExceptions @, string, (err, result) ->
    ## Report failures instead of masking them with a null error, and keep
    ## partial results out of the cache
    return callback.call wordnet, err if err?
    wordnet.cache.set query, result if query
    if callback.length == 1
      return callback.call wordnet, result
    else
      return callback.call wordnet, null, result
validFormsAsync: (string) ->
  ## Promise-returning wrapper around `validForms`
  wordnet = @
  new Promise (resolve, reject) ->
    wordnet.validForms string, (err, data) ->
      return reject err if err?
      resolve data
## The module's export is the WordNet class itself
module.exports = WordNet