UNPKG

node-nlp

Version:

Library for NLU (Natural Language Understanding) done in Node.js

701 lines (678 loc) 23.6 kB
/* * Copyright (c) AXA Shared Services Spain S.A. * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ const Recognizers = require('@microsoft/recognizers-text-suite'); const { SimilarSearch } = require('../util'); const EnumNamedEntity = require('./enum-named-entity'); const NlpUtil = require('../nlp/nlp-util'); const RegexNamedEntity = require('./regex-named-entity'); const TrimNamedEntity = require('./trim-named-entity'); const BuiltinDictionary = require('./builtin-dictionary.json'); const BuiltinInverse = require('./builtin-inverse.json'); /** * Class for a named entities manager, that can be recognized inside * a string. * Basically, a named entity is an enumerator that has a set of options, * and each option can have several words to refer to this option. * Example: * We have the entity "Superhero". Inside "Superhero" we have different * options: * - Superman: Superman * - Spiderman: Spiderman, Spider-man * - Wolverine: Wolverine, Logan, Patch, Weapon X */ class NerManager { /** * Constructor of the class. * @param {Object} settings Settings for initializing this instance. */ constructor(settings) { this.settings = settings || {}; this.threshold = this.settings.threshold || 0.8; this.namedEntities = {}; this.similar = new SimilarSearch({ normalize: true }); this.builtins = this.settings.builtins || [ 'Number', 'Ordinal', 'Percentage', 'Age', 'Currency', 'Dimension', 'Temperature', 'DateTime', 'PhoneNumber', 'IpAddress', 'Boolean', 'Email', 'Hashtag', 'URL', ]; let list = this.settings.builtinWhitelist || [ 'age', 'currency', 'dimension', 'temperature', 'number', 'numberrange', 'ordinal', 'percentage', 'email', 'hashtag', 'ip', 'mention', 'phonenumber', 'url', 'date', 'daterange', 'datetime', 'datetimealt', 'time', 'set', 'timerange', 'timezone', 'boolean', 'duration', ]; this.builtinWhitelist = {}; for (let i = 0; i < list.length; i += 1) { this.builtinWhitelist[list[i]] = true; } list = this.settings.builtinBlacklist || []; for (let i = 0; i < list.length; i += 1) { delete this.builtinWhitelist[list[i]]; } } /** * Given a resolution unit in source language, translate into default language. * @param {string} str Resolution unit in source language. * @param {string} locale Source language locale. * @returns {string} Translation or source resolution unit. */ translate(str, locale) { if (BuiltinDictionary[locale]) { const translation = BuiltinDictionary[locale][str]; return translation !== '' ? translation : str; } return str; } /** * Given a resolution unit in default language, translate into source language. * @param {string} str Resolution unit in default language. * @param {string} locale Source language locale. * @returns {string} Translation or default resolution unit. */ inverseTranslate(str, locale) { if (BuiltinInverse[locale]) { const translation = BuiltinInverse[locale][str]; if (translation && translation.length > 0) { return translation[0]; } } return str; } /** * Creates a new instance of a named entity. * @param {string} entityName Name of the entity * @param {string} type Type of the entity * @returns {NamedEntity} New named entity. */ newNamedEntity(entityName, type = 'enum') { const options = { name: entityName }; switch (type.toLowerCase()) { case 'regex': return new RegexNamedEntity(options); case 'trim': return new TrimNamedEntity(options); default: return new EnumNamedEntity(options); } } /** * Adds a new entity to be managed by the NER. If the entity already exists, * then returns the already existing one. * @param {String} entityName Name of the entity. * @param {String} type Type of the entity. * @returns {Object} Already existing entity or the new one created. */ addNamedEntity(entityName, type) { if (this.namedEntities[entityName]) { return this.namedEntities[entityName]; } const entity = this.newNamedEntity(entityName, type); this.namedEntities[entityName] = entity; return entity; } /** * Get an entity given its name. If the entity does not exists and the * force flag is on, then creates the entity. * @param {String} entityName Name of the entity. * @param {boolean} force Flag to create the entity when it does not exists. * @returns {Object} The entity, or undefined if not found and not forced. */ getNamedEntity(entityName, force = false) { return force ? this.addNamedEntity(entityName) : this.namedEntities[entityName]; } /** * Removes an entity from the NER. * @param {String} entityName Name of the entity. */ removeNamedEntity(entityName) { delete this.namedEntities[entityName]; } /** * Add texts to the given languages of an option of an entity. * @param {String} entityName Name of the entity. * @param {String} optionName Name of the option. * @param {String[]} srcLanguages Language or languages for adding the texts. * @param {String[]} srcTexts Text or texts to be added. */ addNamedEntityText(entityName, optionName, srcLanguages, srcTexts) { const entity = this.getNamedEntity(entityName, true); entity.addText(optionName, srcLanguages, srcTexts); } /** * Remove texts for the given languages of the option of an entity. * @param {String} entityName Name of the entity. * @param {String} optionName Name of the option. * @param {String[]} srcLanguages Languages affected. * @param {String[]} srcTexts Texts to be removed. */ removeNamedEntityText(entityName, optionName, srcLanguages, srcTexts) { const entity = this.getNamedEntity(entityName); if (entity) { entity.removeText(optionName, srcLanguages, srcTexts); } } /** * Given an utterance, search for %entity% format inside it, in order to * return the list of entities referenced by the utterance. * @param {String} utterance Utterance for searching. * @returns {String[]} List of entities. */ getEntitiesFromUtterance(utterance) { const entityKeys = Object.keys(this.namedEntities); const result = []; entityKeys.forEach(entity => { if (utterance.indexOf(`%${entity}%`) > -1) { result.push(entity); } }); return result; } /** * Given an entity and a locale, calculate the resolution. * @param {Object} entity Entity instance with resolution. * @param {string} locale Entity language locale. * @returns {Object} Calculated resolution for the entity. */ calculateResolution(entity, locale) { const { resolution } = entity; if (['number', 'ordinal', 'percentage'].includes(entity.typeName)) { let resValue = resolution.value; if (resValue) { resValue = resValue.replace(',', '.'); } const value = Number.parseFloat(resValue); return { strValue: resValue, value, subtype: value % 1 === 0 ? 'integer' : 'float', }; } if ( entity.typeName === 'datetimeV2.date' || entity.typeName === 'datetimeV2.daterange' ) { if (resolution.values) { if (resolution.values.length === 1) { const resValue = resolution.values[0]; const result = { type: resValue.type, timex: resValue.timex, }; if (resValue.value) { result.strValue = resValue.value; result.date = new Date(resValue.value); } return result; } if (resolution.values.length === 2) { const result = { type: 'interval', timex: resolution.values[0].timex, }; if (resolution.values[0].value) { result.strPastValue = resolution.values[0].value; result.pastDate = new Date(result.strPastValue); } if (resolution.values[0].start) { result.strPastStartValue = resolution.values[0].start; result.pastStartDate = new Date(result.strPastStartValue); } if (resolution.values[0].end) { result.strPastEndValue = resolution.values[0].end; result.pastEndDate = new Date(result.strPastEndValue); } if (resolution.values[1].value) { result.strFutureValue = resolution.values[1].value; result.futureDate = new Date(result.strFutureValue); } if (resolution.values[1].start) { result.strFutureStartValue = resolution.values[1].start; result.futureStartDate = new Date(result.strFutureStartValue); } if (resolution.values[1].end) { result.strFutureEndValue = resolution.values[1].end; result.futureEndDate = new Date(result.strFutureEndValue); } return result; } } } if (!resolution) { return undefined; } if (resolution.unit) { const srcUnit = resolution.unit; resolution.srcUnit = srcUnit; resolution.unit = this.translate(srcUnit, locale); if (resolution.srcUnit === resolution.unit) { resolution.srcUnit = this.inverseTranslate(resolution.srcUnit, locale); } } if (resolution.srcUnit) { return { strValue: resolution.value, value: Number.parseFloat(resolution.value), unit: resolution.unit, localeUnit: resolution.srcUnit, }; } return resolution; } /** * Given an array of edges, detect the trim edges and find overlaps with * non trim edges. When an overlap is detected, reduce the trim edged to * fit with the other edge. * @param {Object[]} edges Edges to be splited * @returns {Object[]} Splited edges. */ splitEdges(edges) { for (let i = 0, l = edges.length; i < l; i += 1) { const edge = edges[i]; if ( edge.type === 'between' || edge.type === 'after' || edge.type === 'afterLast' || edge.type === 'afterFirst' || edge.type === 'before' || edge.type === 'beforeFirst' || edge.type === 'beforeLast' ) { for (let j = 0; j < edges.length; j += 1) { const other = edges[j]; if ( i !== j && other.start >= edge.start && other.end <= edge.end && other.type !== 'between' && other.type !== 'after' && other.type !== 'afterLast' && other.type !== 'afterFirst' && other.type !== 'before' && other.type !== 'beforeFirst' && other.type !== 'beforeLast' ) { const edgeLen = edge.end - edge.start; const otherLen = other.end - other.start; if (other.start - edge.start > edge.end - other.end) { // is at the beginning const text = edge.sourceText.substring(0, edgeLen - otherLen - 1); edge.sourceText = text; edge.utteranceText = text; edge.end = other.start - 2; edge.len = text.length; } else { const text = edge.sourceText.substring(edgeLen - otherLen + 2); edge.sourceText = text; edge.utteranceText = text; edge.start = other.end + 2; edge.len = text.length; } } } } } return edges; } /** * Execute a pre-reduction of edges before running the final reduce edges. * @param {Object[]} edges Array of edges. * @returns {Object[]} Array of reduced edges. */ prereduceEdges(edges) { for (let i = 0, l = edges.length; i < l; i += 1) { const edge = edges[i]; if (!edge.discarded) { for (let j = i + 1; j < l; j += 1) { const other = edges[j]; if (!other.discarded) { if (other.start === edge.start && other.end === edge.end) { if (other.entity === 'number' && edge.entiy === 'ordinal') { other.discarded = true; } else if ( other.entity === 'ordinal' && edge.entity === 'number' ) { edge.discarded = true; } else if ( other.entity === edge.entity && edge.entity === 'number' ) { if ( (other.sourceText.includes(',') || other.sourceText.includes('.')) && parseFloat(other.sourceText.replace(',', '.')) !== parseFloat(other.resolution.strValue.replace(',', '.')) ) { other.discarded = true; } if ( (edge.sourceText.includes(',') || edge.sourceText.includes('.')) && parseFloat(edge.sourceText.replace(',', '.')) !== parseFloat(edge.resolution.strValue.replace(',', '.')) ) { edge.discarded = true; } } } } } } } const result = []; for (let i = 0, l = edges.length; i < l; i += 1) { if (!edges[i].discarded) { result.push(edges[i]); } } return result; } /** * Extract built-in entities for the utterance given the language. * @param {string} utterance Input utterance. * @param {string} language Language locale. * @returns {Object[]} Extracted entities as edges array. */ findBuiltinEntities(utterance, language) { const result = []; const culture = NlpUtil.getCulture(language); this.builtins.forEach(name => { try { const entities = Recognizers[`recognize${name}`](utterance, culture); if (name === 'Number') { entities.push( ...Recognizers.recognizeNumber(utterance, NlpUtil.getCulture('en')) ); } for (let i = 0; i < entities.length; i += 1) { const entity = entities[i]; let entityName = entity.typeName; const index = entityName.lastIndexOf('.'); if (index !== -1) { entityName = entityName.slice(index + 1); } if (this.builtinWhitelist[entityName]) { const text = utterance.slice(entity.start, entity.end + 1); const accuracy = 0.95; const edge = { start: entity.start, end: entity.end, len: entity.end - entity.start + 1, accuracy, sourceText: text, utteranceText: text, entity: entityName, }; const resolution = this.calculateResolution(entity, language); if (resolution) { edge.resolution = resolution; } result.push(edge); } } } catch (ex) { // } }); return this.similar .reduceEdges(this.prereduceEdges(result), false) .sort((a, b) => a.start - b.start); } /** * Find entities inside an utterance. * @param {String} utterance Utterance for searching entities. * @param {String} locale Locale of the language. * @param {String[]} whitelist Whitelist of entity names. * @returns {Promise.Object[]} Promise edges of entities found. */ async findEntities(utterance, language, whitelist) { const entityNames = whitelist || Object.keys(this.namedEntities); const wordPositions = this.similar.getWordPositions(utterance); const edges = this.findBuiltinEntities(utterance, language); entityNames.forEach(entityName => { const entity = this.namedEntities[entityName]; if (entity) { const newEdges = entity.extract( utterance, language, this.similar, wordPositions, this.threshold ); newEdges.forEach(edge => { edges.push(edge); }); } }); return this.similar.reduceEdges(this.splitEdges(edges), false); } findNamedEntities(utterance, language, whitelist) { const entityNames = whitelist || Object.keys(this.namedEntities); const wordPositions = this.similar.getWordPositions(utterance); const edges = []; entityNames.forEach(entityName => { const entity = this.namedEntities[entityName]; if (entity) { const newEdges = entity.extract( utterance, language, this.similar, wordPositions, this.threshold ); newEdges.forEach(edge => { edges.push(edge); }); } }); return this.similar.reduceEdges(this.splitEdges(edges), false); } /** * Find entities on utterance, and replace them by the entity name. * @param {String} utterance Utterance to be processed. * @param {String} locale Locale of the utterance. * @returns {Promise.String} Promise utterance with entities replaced by entity name. */ async generateEntityUtterance(utterance, locale) { const entities = await this.findEntities(utterance, locale); if (entities.length === 0) { return utterance; } let index = 0; let result = ''; for (let i = 0; i < entities.length; i += 1) { const entity = entities[i]; const left = utterance.slice(index, entity.start); index = entity.end; result += left; result += `%${entity.entity}%`; } const right = utterance.slice(entities[entities.length - 1].end + 1); result += right; return result; } generateNamedEntityUtterance(utterance, locale) { const entities = this.findNamedEntities(utterance, locale); if (entities.length === 0) { return undefined; } let index = 0; let result = ''; for (let i = 0; i < entities.length; i += 1) { const entity = entities[i]; const left = utterance.slice(index, entity.start); index = entity.end; result += left; result += `%${entity.entity}%`; } const right = utterance.slice(entities[entities.length - 1].end + 1); result += right; return result; } /** * Returns a clone object representing this, for saving. * @returns {Object} Clone object. */ save() { const result = {}; result.settings = this.settings; result.threshold = this.threshold; result.builtins = this.builtins; result.namedEntities = {}; const keys = Object.keys(this.namedEntities); for (let i = 0; i < keys.length; i += 1) { const entity = this.namedEntities[keys[i]]; const clone = { type: entity.type, name: entity.name, localeFallback: entity.localeFallback, }; if (entity.type === 'enum') { clone.locales = entity.locales; } else if (entity.type === 'regex') { clone.locales = {}; const localeKeys = Object.keys(entity.locales); for (let j = 0; j < localeKeys.length; j += 1) { clone.locales[localeKeys[j]] = { regex: RegexNamedEntity.regex2str( entity.locales[localeKeys[j]].regex ), }; } } else if (entity.type === 'trim') { clone.locales = {}; const localeKeys = Object.keys(entity.locales); for (let j = 0; j < localeKeys.length; j += 1) { clone.locales[localeKeys[j]] = { conditions: [], }; const { conditions } = entity.locales[localeKeys[j]]; const cloneConditions = clone.locales[localeKeys[j]].conditions; for (let k = 0; k < conditions.length; k += 1) { const condition = conditions[k]; const cloneCondition = {}; cloneCondition.type = condition.type; cloneCondition.options = condition.options; if (condition.type === 'between') { cloneCondition.leftWords = condition.leftWords; cloneCondition.rightWords = condition.rightWords; cloneCondition.regex = RegexNamedEntity.regex2str( condition.regex ); } else { cloneCondition.words = condition.words; } cloneConditions.push(cloneCondition); } } } result.namedEntities[keys[i]] = clone; } return result; } /** * Load this instance from an object. * @param {Object} obj Object to load from. */ load(obj) { this.settings = obj.settings; this.threshold = obj.threshold; this.builtins = obj.builtins || []; const keys = Object.keys(obj.namedEntities); for (let i = 0; i < keys.length; i += 1) { const cloned = obj.namedEntities[keys[i]]; const entity = this.addNamedEntity(cloned.name, cloned.type); if (cloned.type === 'enum') { entity.locales = cloned.locales; } else if (cloned.type === 'regex') { const localeKeys = Object.keys(cloned.locales); for (let j = 0; j < localeKeys.length; j += 1) { entity.locales[localeKeys[j]] = { regex: RegexNamedEntity.str2regex( cloned.locales[localeKeys[j]].regex ), }; } } else if (cloned.type === 'trim') { const localeKeys = Object.keys(cloned.locales); for (let j = 0; j < localeKeys.length; j += 1) { entity.locales[localeKeys[j]] = { conditions: [], }; const clonedConditions = cloned.locales[localeKeys[j]].conditions; const { conditions } = entity.locales[localeKeys[j]]; for (let k = 0; k < clonedConditions.length; k += 1) { const condition = { type: clonedConditions[k].type, options: clonedConditions[k].options, }; if (condition.type === 'between') { condition.leftWords = clonedConditions[k].leftWords; condition.rightWords = clonedConditions[k].rightWords; condition.regex = RegexNamedEntity.str2regex( clonedConditions[k].regex ); } else { condition.words = clonedConditions[k].words; } conditions.push(condition); } } } } } } module.exports = NerManager;