UNPKG

symspell-ex

Version:

Spelling correction & Fuzzy search based on symmetric delete spelling correction algorithm

304 lines (303 loc) 12.4 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SymSpellEx = void 0;
const core_1 = require("./core");
const editDistance_1 = require("./core/nlp/editDistance");
const tokenizers_1 = require("./core/nlp/tokenizers");
const DEFAULT_MAX_DISTANCE = 2;
const DEFAULT_MAX_SUGGESTIONS = 5;
/**
 * Spelling correction and fuzzy search built on a delete-only index:
 * every added term is indexed under the strings obtained by deleting
 * characters from it, and lookups probe the same kind of deletes of the input.
 *
 * Dictionary entries are array-like: slot 0 holds the term frequency
 * (0 for keys that exist only as deletes), the remaining slots hold
 * indices of terms that can be reduced to the key.
 */
class SymSpellEx {
    /**
     * @param store - backing data store for entries and terms
     * @param editDistance - object exposing `calculateDistance(a, b)`
     * @param tokenizer - object exposing `tokenize(input)`
     * @param {number} maxDistance - default maximum edit distance
     * @param {number} maxSuggestions - default maximum number of suggestions
     */
    constructor(store, editDistance = new editDistance_1.DamerauLevenshteinDistance(), tokenizer = new tokenizers_1.CoreTokenizer(), maxDistance = DEFAULT_MAX_DISTANCE, maxSuggestions = DEFAULT_MAX_SUGGESTIONS) {
        this._language = core_1.Languages.ENGLISH;
        this._isInitialized = false;
        this.store = store || null;
        this.editDistance = editDistance;
        this._tokenizer = tokenizer;
        this._maxDistance = maxDistance;
        this._maxSuggestions = maxSuggestions;
    }
    /**
     * Initializes the backing store and marks this instance as ready.
     * @returns {Promise<void>}
     * @throws {Error} when no store was given to the constructor
     */
    async initialize() {
        if (this.store == null) {
            throw new Error('SymSpellEx requires a data store, Please provide one to the constructor');
        }
        await this.store.initialize();
        this._isInitialized = true;
    }
    /** @returns {boolean} true once initialize() has completed */
    isInitialized() {
        return this._isInitialized;
    }
    get maxDistance() {
        return this._maxDistance;
    }
    get maxSuggestions() {
        return this._maxSuggestions;
    }
    /**
     * Sets the current language on this instance and on the store.
     * @param {string} language
     * @returns {Promise<void>}
     */
    async setLanguage(language) {
        this._checkForReadiness();
        this._language = language;
        await this.store.setLanguage(language);
    }
    get language() {
        return this._language;
    }
    /** @throws {Error} when initialize() has not completed yet */
    _checkForReadiness() {
        if (!this._isInitialized) {
            throw new Error('SymSpellEx must be initialized, Please call initialize() first');
        }
    }
    /**
     * Recursively collects the strings obtained by deleting characters from `word`.
     * @param {string} word
     * @param {number} min - current recursion depth (callers start at 0)
     * @param {number} max - depth limit; recursion continues while depth + 1 < max
     * @param {Set<string>|null} deletes - accumulator, created when null/undefined
     * @returns {Set<string>} the accumulator, containing every collected delete
     */
    edits(word, min, max, deletes) {
        const collected = deletes || new Set();
        const depth = min + 1;
        // Single-character (or empty) words produce no deletes.
        if (word.length > 1) {
            for (let i = 0; i < word.length; i += 1) {
                const deletedItem = word.substring(0, i) + word.substring(i + 1);
                if (!collected.has(deletedItem)) {
                    collected.add(deletedItem);
                    if (depth < max) {
                        this.edits(deletedItem, depth, max, collected);
                    }
                }
            }
        }
        return collected;
    }
    /**
     * Sorts suggestions by ascending distance, then descending frequency,
     * and keeps the first `max` of them. The input array is sorted in place.
     * @param {Array} suggestions
     * @param {number} max
     * @returns {Promise<Array>}
     */
    async filterAndRankSuggestions(suggestions, max) {
        if (suggestions == null || suggestions.length <= 0) {
            return [];
        }
        return suggestions
            .sort((a, b) => {
                if (a.distance !== b.distance) {
                    return a.distance < b.distance ? -1 : 1;
                }
                if (a.frequency !== b.frequency) {
                    return a.frequency > b.frequency ? -1 : 1;
                }
                // Ties must compare equal, otherwise the comparator is inconsistent.
                return 0;
            })
            .filter((_, index) => index < max);
    }
    /**
     * Computes the edit distance between the normalized input and a stored term.
     * @param {string} iTerm - normalized input term
     * @param {string} sTerm - stored term referenced by the candidate's entry
     * @param {number} candidateLength - length of the delete candidate that led to sTerm
     * @returns {number}
     */
    _distanceToSuggestion(iTerm, sTerm, candidateLength) {
        if (iTerm === sTerm) {
            return 0;
        }
        const iLength = iTerm.length;
        const sLength = sTerm.length;
        // Length-based shortcuts: when one side has the candidate's length,
        // the length difference of the other side is used as the distance.
        if (sLength === candidateLength) {
            return iLength - candidateLength;
        }
        if (iLength === candidateLength) {
            return sLength - candidateLength;
        }
        // Strip the common prefix and suffix before running the distance algorithm.
        let prefix = 0;
        while (prefix < sLength && prefix < iLength && sTerm[prefix] === iTerm[prefix]) {
            prefix++;
        }
        let suffix = 0;
        // The suffix must not overlap the prefix in EITHER string; otherwise
        // e.g. "aaa" vs "aaaa" would be trimmed to two empty strings (distance 0).
        while (suffix < sLength - prefix
            && suffix < iLength - prefix
            && sTerm[sLength - suffix - 1] === iTerm[iLength - suffix - 1]) {
            suffix++;
        }
        return this.editDistance.calculateDistance(sTerm.slice(prefix, sLength - suffix), iTerm.slice(prefix, iLength - suffix));
    }
    /**
     * Finds suggestions for a single term.
     * @param {string} term
     * @param {string} [language] - falsy values fall back to the current language
     * @param {number} [maxDistance] - falsy values (including 0) fall back to the instance default
     * @param {number} [maxSuggestions] - falsy values (including 0) fall back to the instance default
     * @returns {Promise<Array>} ranked suggestions, empty when nothing matches
     */
    async lookup(term, language, maxDistance, maxSuggestions) {
        this._checkForReadiness();
        if (term == null) {
            return [];
        }
        const iLanguage = language || this._language;
        const iMaxDistance = maxDistance || this._maxDistance;
        const iMaxSuggestions = maxSuggestions || this._maxSuggestions;
        const iTerm = term.toLowerCase().trim();
        const iLength = iTerm.length;
        const maxKeyLength = await this.store.maxEntryLength();
        // NOTE(review): when a different language is requested the store is switched
        // to it and is not switched back here, while this._language stays unchanged.
        if (iLanguage !== this._language) {
            await this.store.setLanguage(iLanguage);
        }
        // Even after iMaxDistance deletions the input would be longer than any key.
        if (iLength - iMaxDistance > maxKeyLength) {
            return [];
        }
        const candidates = [iTerm];
        const candidateSet = new Set();
        const suggestionSet = new Set();
        let suggestions = [];
        while (candidates.length > 0) {
            const candidate = candidates.shift();
            const inputCandidateDistance = iLength - candidate.length;
            // Only suggestions at the smallest distance found so far are kept, so
            // candidates that are already further away cannot contribute.
            if (suggestions.length > 0 && inputCandidateDistance > suggestions[0].distance) {
                break;
            }
            const entry = await this.store.getEntry(candidate);
            if (entry != null) {
                // A positive frequency means the candidate itself is a dictionary term.
                if (entry[0] > 0 && !suggestionSet.has(candidate)) {
                    suggestionSet.add(candidate);
                    suggestions.push(new core_1.Suggestion(term, candidate, inputCandidateDistance, entry[0]));
                    if (inputCandidateDistance === 0) {
                        break;
                    }
                }
                const termsCache = await this.store.getTermsAt(entry);
                for (let i = 1; i < entry.length; i += 1) {
                    const sTerm = termsCache[i] != null
                        ? termsCache[i]
                        : await this.store.getTermAt(entry[i]);
                    if (suggestionSet.has(sTerm)) {
                        continue;
                    }
                    suggestionSet.add(sTerm);
                    const distance = this._distanceToSuggestion(iTerm, sTerm, candidate.length);
                    if (suggestions.length > 0) {
                        if (distance < suggestions[0].distance) {
                            // A closer match invalidates everything collected so far.
                            suggestions = [];
                        }
                        else if (distance > suggestions[0].distance) {
                            continue;
                        }
                    }
                    if (distance <= iMaxDistance) {
                        const suggestionEntry = await this.store.getEntry(sTerm);
                        if (suggestionEntry != null) {
                            suggestions.push(new core_1.Suggestion(term, sTerm, distance, suggestionEntry[0]));
                        }
                    }
                }
            }
            // Queue the next level of deletes while still within the distance budget.
            if (inputCandidateDistance < iMaxDistance) {
                for (let i = 0; i < candidate.length; i++) {
                    const deletedItem = candidate.substring(0, i) + candidate.substring(i + 1);
                    if (!candidateSet.has(deletedItem)) {
                        candidates.push(deletedItem);
                        candidateSet.add(deletedItem);
                    }
                }
            }
        }
        return this.filterAndRankSuggestions(suggestions, iMaxSuggestions);
    }
    /**
     * Adds a term to the dictionary and indexes its deletes.
     * Terms that are null or at most one character long are ignored.
     * @param {string} term
     * @param {number} [frequency=1]
     * @param {string} [language] - falsy values fall back to the current language
     * @param {number} [maxDistance] - falsy values (including 0) fall back to the instance default
     * @returns {Promise<void>} resolves once every store write has completed
     */
    async add(term, frequency = 1, language, maxDistance) {
        this._checkForReadiness();
        if (term == null || term.length <= 1) {
            return;
        }
        const iLanguage = language || this._language;
        const iMaxDistance = maxDistance || this._maxDistance;
        const iTerm = term.toLowerCase().trim();
        if (iLanguage !== this._language) {
            await this.store.setLanguage(iLanguage);
        }
        let initialEntry = true;
        let entry = await this.store.getEntry(iTerm);
        if (entry == null) {
            entry = new core_1.DictionaryEntry(frequency);
        }
        else if (entry[0] === 0) {
            // The key existed only as a delete of another term; promote it to a real term.
            entry[0] = frequency;
        }
        else {
            initialEntry = false;
        }
        await this.store.setEntry(iTerm, entry);
        if (!initialEntry) {
            return;
        }
        const number = await this.store.pushTerm(iTerm) - 1;
        const deletesArray = Array.from(this.edits(iTerm, 0, iMaxDistance, null));
        const items = await this.store.getEntries(deletesArray);
        // Await every write so add() only resolves after the index is updated and
        // store failures reach the caller (async callbacks in forEach are not awaited).
        await Promise.all(items.map(async (item, index) => {
            const dKey = deletesArray[index];
            if (item == null) {
                await this.store.setEntry(dKey, new core_1.DictionaryEntry(0, number));
                return;
            }
            // Slot 0 is the frequency, so search for the term index from slot 1 on.
            if (item.indexOf(number, 1) < 0) {
                item.push(number);
                await this.store.setEntry(dKey, item);
            }
        }));
    }
    /**
     * Train on bulk data
     * @param {Array<string>} terms - each item is comma separated value contains "term,frequency"
     * @param {string} language
     * @returns {Promise<void>}
     */
    async train(terms, language) {
        this._checkForReadiness();
        for (let i = 0; i < terms.length; i += 1) {
            if (terms[i] == null || terms[i].length === 0) {
                continue;
            }
            const [term, frequency] = terms[i].split(/,/);
            if (term == null || term.length === 0) {
                continue;
            }
            // A missing or unparsable frequency falls back to 1.
            await this.add(term, parseInt(frequency, 10) || 1, language);
        }
    }
    /**
     * Alias of lookup() with the instance defaults applied.
     * @returns {Promise<Array>}
     */
    async search(input, language, maxDistance = this._maxDistance, maxSuggestions = this._maxSuggestions) {
        return this.lookup(input, language, maxDistance, maxSuggestions);
    }
    /**
     * Tokenizes the input, replaces each word token of two or more characters with
     * its best suggestion (when one exists) and reassembles the text.
     * @param {string} input
     * @param {string} [language]
     * @param {number} [maxDistance]
     * @returns {Promise<Object|null>} a Correction, or null when input is null/undefined
     */
    async correct(input, language, maxDistance = this._maxDistance) {
        this._checkForReadiness();
        if (input == null) {
            return null;
        }
        const suggestions = [];
        // Output is assembled from strings: a fixed-size byte buffer silently
        // truncates once corrections make the text longer than the allocation.
        const parts = [];
        const tokens = this._tokenizer.tokenize(input);
        for (let i = 0; i < tokens.length; i += 1) {
            const token = tokens[i];
            let termSuggestion = new core_1.Suggestion(token.value, null, 0, 0);
            if (token.tag === core_1.TokenTags.WORD && token.value.length >= 2) {
                const lSuggestions = await this.lookup(token.value, language, maxDistance, 1);
                if (lSuggestions.length > 0) {
                    termSuggestion = lSuggestions[0];
                }
            }
            // Carry a leading capital letter of the original token over to the suggestion.
            const sTerm = termSuggestion.suggestion;
            if (sTerm != null && /^[A-Z]/.test(token.value)) {
                termSuggestion.suggestion = `${sTerm.charAt(0).toUpperCase()}${sTerm.slice(1)}`;
            }
            suggestions.push(termSuggestion);
            const corrected = termSuggestion.suggestion != null
                ? termSuggestion.suggestion
                : termSuggestion.term;
            // token.distance is the number of spaces appended after the token.
            parts.push(`${corrected}${' '.repeat(token.distance)}`);
        }
        return new core_1.Correction(input, parts.join(''), suggestions);
    }
    /**
     * Clears the backing store.
     * @returns {Promise<void>}
     */
    async clear() {
        this._checkForReadiness();
        await this.store.clear();
    }
}
exports.SymSpellEx = SymSpellEx;