UNPKG

@predictive-text-studio/models-templates

Version:

Backing model implementations (templates) for Keyman's modelling layer

github.com/keymanapp/keyman

eddieantonio/keyman

733 lines • 34.3 kB

JavaScript

var models;
(function (models) {
    /** Code unit used as the child key under which an internal trie node stores its own entries. */
    models.SENTINEL_CODE_UNIT = '\uFDD0';
    /**
     * Applies a transform to a context and returns the resulting context.
     * Neither argument is mutated.
     *
     * @param transform Edit to apply: `insert` text plus `deleteLeft` / `deleteRight` counts.
     *                  Missing counts are treated as 0.
     * @param context   Object with `left` / `right` text and `startOfBuffer` / `endOfBuffer` flags.
     * @returns A new context object with the edited `left` and `right` text; the
     *          buffer-boundary flags are copied over unchanged.
     */
    function applyTransform(transform, context) {
        // First, get the current context
        var fullLeftContext = context.left || '';
        var lLen = fullLeftContext.length;
        // Never delete more than is available. A missing count must not turn into
        // NaN, which would make substring() drop the whole left context.
        var lDel = Math.min(lLen, transform.deleteLeft || 0);
        var leftContext = fullLeftContext.substring(0, lLen - lDel) + (transform.insert || '');
        var fullRightContext = context.right || '';
        var rLen = fullRightContext.length;
        var rDel = Math.min(rLen, transform.deleteRight || 0);
        var rightContext = fullRightContext.substring(rDel);
        return {
            left: leftContext,
            right: rightContext,
            startOfBuffer: context.startOfBuffer,
            endOfBuffer: context.endOfBuffer
        };
    }
    models.applyTransform = applyTransform;
    /**
     * Merges one transform into another, mutating the first parameter to
     * include the effects of the second.
     *
     * @param transform The transform to mutate; receives the merged result.
     * @param prefix    The transform whose `insert` text is placed before
     *                  `transform.insert` and whose delete counts are added.
     */
    function prependTransform(transform, prefix) {
        transform.insert = prefix.insert + transform.insert;
        transform.deleteLeft += prefix.deleteLeft;
        if (prefix.deleteRight) {
            transform.deleteRight = (transform.deleteRight || 0) + prefix.deleteRight;
        }
    }
    models.prependTransform = prependTransform;
    /**
     * Tests whether a UTF-16 code unit is a high surrogate (0xD800-0xDBFF).
     *
     * @param codeUnit Either a numeric code unit or a string, in which case
     *                 its first code unit is examined.
     */
    function isHighSurrogate(codeUnit) {
        if (typeof codeUnit == 'string') {
            codeUnit = codeUnit.charCodeAt(0);
        }
        return codeUnit >= 0xD800 && codeUnit <= 0xDBFF;
    }
    models.isHighSurrogate = isHighSurrogate;
    /** Tests whether `char` is the sentinel code unit. */
    function isSentinel(char) {
        return char == models.SENTINEL_CODE_UNIT;
    }
    models.isSentinel = isSentinel;
})(models || (models = {}));
/**
 * @file priority-queue.ts
 *
 * Defines a mildly abstracted priority queue implementation.
 */
var models;
(function (models) {
    var PriorityQueue = /** @class */ (function () {
        /**
         * Constructs a priority queue, optionally pre-populated.
         * @param comparator A `Comparator` returning negative values when and only when
         * the first parameter should precede the second parameter.
         * @param initialEntries Optional entries to start with; they are copied, then heapified.
         */
        function PriorityQueue(comparator, initialEntries) {
            if (initialEntries === void 0) { initialEntries = []; }
            // TODO: We may wish to allow options specifying a limit or threshold for adding
            // items to the priority queue.  Possibly both.
            //
            // When that time comes, consider a min-max heap.
            // https://en.wikipedia.org/wiki/Min-max_heap
            this.comparator = comparator;
            this.heap = Array.from(initialEntries);
            this.heapify();
        }
        PriorityQueue.leftChildIndex = function (index) {
            return index * 2 + 1;
        };
        PriorityQueue.rightChildIndex = function (index) {
            return index * 2 + 2;
        };
        PriorityQueue.parentIndex = function (index) {
            return Math.floor((index - 1) / 2);
        };
        /**
         * Restores the heap property after the nodes in `[start, end]` may have
         * gained new descendants. Called with no arguments, it re-heapifies the
         * entire backing array.
         *
         * @param start Lowest index to sift down.
         * @param end   Highest index to sift down.
         */
        PriorityQueue.prototype.heapify = function (start, end) {
            if (start == undefined || end == undefined) {
                this.heapify(0, this.count - 1);
                // The delegated call did all the work; don't fall through with undefined bounds.
                return;
            }
            // Use of 'indices' here is a bit of a customization.
            // At the cost of (temporary) extra storage space, we can more efficiently enqueue
            // multiple elements simultaneously.
            var queuedIndices = [];
            var lastParent = -1;
            for (var i = end; i >= start; i--) {
                var parent = PriorityQueue.parentIndex(i);
                if (this.siftDown(i) && parent < start && lastParent != parent) {
                    // We only need to queue examination for a heap node if its children have changed
                    // and it isn't already being examined.
                    queuedIndices.push(parent);
                    lastParent = parent;
                }
            }
            lastParent = -1;
            while (queuedIndices.length > 0) {
                var index = queuedIndices.shift();
                var parent = PriorityQueue.parentIndex(index);
                if (this.siftDown(index) && parent >= 0 && lastParent != parent) {
                    // We only need to queue examination for a heap node if its children have changed.
                    queuedIndices.push(parent);
                    lastParent = parent;
                }
            }
        };
        Object.defineProperty(PriorityQueue.prototype, "count", {
            /**
             * Returns the number of elements currently held by the priority queue.
             */
            get: function () {
                return this.heap.length;
            },
            enumerable: true,
            configurable: true
        });
        /**
         * Returns the highest-priority item within the priority queue.
         * <p>
         * Is O(1).
         */
        PriorityQueue.prototype.peek = function () {
            return this.heap[0]; // undefined if it doesn't exist... which is completely correct.
        };
        /**
         * Inserts a new element into the priority queue, placing it in order.
         * <p>
         * Is O(log N), where N = # of items in the priority queue.
         * @param element
         */
        PriorityQueue.prototype.enqueue = function (element) {
            var index = this.heap.length;
            this.heap.push(element);
            var parent = PriorityQueue.parentIndex;
            var parentIndex = parent(index);
            // Bubble the new element up while it should precede its parent.
            while (index !== 0 && this.comparator(this.heap[index], this.heap[parentIndex]) < 0) {
                var a = this.heap[index];
                this.heap[index] = this.heap[parentIndex];
                this.heap[parentIndex] = a;
                index = parentIndex;
                parentIndex = parent(index);
            }
        };
        /**
         * Efficiently batch-enqueues multiple elements.
         * Worst-case is the _better_ of the following:
         * - O(`elements.count` + `heap.count`) - large element counts will trigger in-place
         *   heap reconstruction.
         * - O(`elements.count` * log(`heap.count`)) - logarithmic when elements.count << heap.count
         * @param elements A group of elements to enqueue simultaneously.
         */
        PriorityQueue.prototype.enqueueAll = function (elements) {
            if (elements.length == 0) {
                return;
            }
            var firstIndex = this.count;
            this.heap = this.heap.concat(elements);
            var firstParent = PriorityQueue.parentIndex(firstIndex);
            // The 'parent' of index 0 will return -1, which is illegal.
            this.heapify(firstParent >= 0 ? firstParent : 0, PriorityQueue.parentIndex(this.count - 1));
        };
        /**
         * Removes the highest-priority element from the queue, returning it.
         * Returns `undefined` when the queue is empty.
         * <p>
         * Is O(log N), where N = number of items in the priority queue.
         */
        PriorityQueue.prototype.dequeue = function () {
            if (this.count == 0) {
                return undefined;
            }
            var root = this.heap[0];
            var tail = this.heap.pop();
            if (this.heap.length > 0) {
                // Move the last element to the root, then push it down into place.
                this.heap[0] = tail;
                this.siftDown(0);
            }
            return root;
        };
        /**
         * Compares the entry at the specified index against its children,
         * propagating it downward within the heap until heap requirements are satisfied.
         * <p>
         * Is O(log N), where N = number of items in the priority queue.
         *
         * @param index The index of the top-most node that must be examined
         * for repositioning.
         * @returns `true` if a swap occurred, `false` otherwise.
         */
        PriorityQueue.prototype.siftDown = function (index) {
            var leftIndex = PriorityQueue.leftChildIndex(index);
            var rightIndex = PriorityQueue.rightChildIndex(index);
            var topMostIndex = index;
            if (leftIndex < this.heap.length && this.comparator(this.heap[leftIndex], this.heap[topMostIndex]) < 0) {
                topMostIndex = leftIndex;
            }
            if (rightIndex < this.heap.length && this.comparator(this.heap[rightIndex], this.heap[topMostIndex]) < 0) {
                topMostIndex = rightIndex;
            }
            if (topMostIndex != index) {
                var a = this.heap[index];
                this.heap[index] = this.heap[topMostIndex];
                this.heap[topMostIndex] = a;
                this.siftDown(topMostIndex);
                return true;
            }
            else {
                return false;
            }
        };
        /**
         * Returns an array containing all entries of the priority queue.
         * Altering the returned array will not affect the queue, but mutating
         * the array's elements can cause unintended side effects.
         *
         * This function makes no guarantees on the ordering of the returned elements;
         * they will almost certainly be unsorted.
         */
        PriorityQueue.prototype.toArray = function () {
            return Array.from(this.heap);
        };
        return PriorityQueue;
    }());
    models.PriorityQueue = PriorityQueue;
})(models || (models = {}));
/*
 * Copyright (c) 2019 National Research Council Canada (author: Eddie A. Santos)
 * Copyright (c) 2019 SIL International
 * Copyright (c) 2015–2017 Conrad Irwin
 * Copyright (c) 2011–2015 Marc Campbell
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 * the Software, and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
// Compiler-emitted helper that drives the switch-based state machines used for
// generator functions below. Reproduced verbatim; do not hand-edit.
var __generator = (this && this.__generator) || function (thisArg, body) {
    var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
    return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
    function verb(n) { return function (v) { return step([n, v]); }; }
    function step(op) {
        if (f) throw new TypeError("Generator is already executing.");
        while (_) try {
            if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
            if (y = 0, t) op = [op[0] & 2, t.value];
            switch (op[0]) {
                case 0: case 1: t = op; break;
                case 4: _.label++; return { value: op[1], done: false };
                case 5: _.label++; y = op[1]; op = [0]; continue;
                case 7: op = _.ops.pop(); _.trys.pop(); continue;
                default:
                    if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
                    if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
                    if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
                    if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
                    if (t[2]) _.ops.pop();
                    _.trys.pop(); continue;
            }
            op = body.call(thisArg, _);
        } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
        if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
    }
};
/// <reference path="common.ts" />
/// <reference path="priority-queue.ts" />
/**
 * @file trie-model.ts
 *
 * Defines a simple word list (unigram) model.
 */
var models;
(function (models) {
    /** Upper bound on the amount of suggestions to generate. */
    var MAX_SUGGESTIONS = 12;
    /**
     * @class TrieModel
     *
     * Defines a trie-backed word list model, or the unigram model.
     * Unigram models throw away all preceding words, and search
     * for the next word exclusively. As such, they can perform simple
     * prefix searches within words, however they are not very good
     * at predicting the next word.
     */
    var TrieModel = /** @class */ (function () {
        /**
         * @param trieData Object providing the trie's `root` node and `totalWeight`.
         * @param options  Optional settings: `searchTermToKey`, `wordBreaker`, `punctuation`.
         *                 When `wordBreaker` is omitted the default word breaker is looked up,
         *                 which may `require` an external package.
         */
        function TrieModel(trieData, options) {
            if (options === void 0) { options = {}; }
            this._trie = new Trie(trieData['root'], trieData['totalWeight'], options.searchTermToKey || defaultSearchTermToKey);
            this.breakWords = options.wordBreaker || getDefaultWordBreaker();
            this.punctuation = options.punctuation;
        }
        /**
         * Stores and returns the context-window sizes taken from `capabilities`.
         */
        TrieModel.prototype.configure = function (capabilities) {
            return this.configuration = {
                leftContextCodePoints: capabilities.maxLeftContextCodePoints,
                rightContextCodePoints: capabilities.maxRightContextCodePoints
            };
        };
        /**
         * Produces suggestions for the word being typed.
         *
         * NOTE(review): relies on a `kmwLength()` method being present on strings;
         * it is not defined in this file - confirm the host environment installs it.
         *
         * @param transform The pending edit (`insert`, `deleteLeft`, ...).
         * @param context   The context the edit applies to.
         * @returns An array of `{ sample, p }` pairs, where `sample` is a suggestion
         *          (`transform`, `displayAs`, `p`).
         */
        TrieModel.prototype.predict = function (transform, context) {
            // Special-case the empty buffer/transform: return the top suggestions.
            if (!transform.insert && context.startOfBuffer && context.endOfBuffer) {
                return makeDistribution(this._trie.firstN(MAX_SUGGESTIONS).map(function (_a) {
                    var text = _a.text, p = _a.p;
                    return ({
                        transform: {
                            insert: text,
                            deleteLeft: 0
                        },
                        displayAs: text,
                        p: p
                    });
                }));
            }
            // Compute the results of the keystroke:
            var newContext = models.applyTransform(transform, context);
            // Computes the difference in word length after applying the transform above.
            var leftDelOffset = transform.deleteLeft - transform.insert.kmwLength();
            // All text to the left of the cursor INCLUDING anything that has
            // just been typed.
            var prefix = this.getLastWord(newContext.left);
            // Return suggestions from the trie.
            return makeDistribution(this._trie.lookup(prefix).map(function (_a) {
                var text = _a.text, p = _a.p;
                return ({
                    transform: {
                        // Insert the suggestion from the Trie, verbatim
                        insert: text,
                        // Delete whatever the prefix that the user wrote.
                        // Note: a separate capitalization/orthography engine can take this
                        // result and transform it as needed.
                        deleteLeft: leftDelOffset + prefix.kmwLength(),
                    },
                    displayAs: text,
                    p: p
                });
            }));
            /* Helper */
            function makeDistribution(suggestions) {
                var distribution = [];
                for (var _i = 0, suggestions_1 = suggestions; _i < suggestions_1.length; _i++) {
                    var s = suggestions_1[_i];
                    distribution.push({ sample: s, p: s.p });
                }
                return distribution;
            }
        };
        /**
         * Get the last word of the phrase, or nothing.
         * @param fullLeftContext the entire left context of the string.
         */
        TrieModel.prototype.getLastWord = function (fullLeftContext) {
            var words = this.breakWords(fullLeftContext);
            if (words.length > 0) {
                return words.pop().text;
            }
            return '';
        };
        /** Returns the last word of `context.left`, or `''` if there is none. */
        TrieModel.prototype.wordbreak = function (context) {
            return this.getLastWord(context.left);
        };
        /** Returns a traversal positioned at the trie's root with an empty prefix. */
        TrieModel.prototype.traverseFromRoot = function () {
            return new TrieModel.Traversal(this._trie['root'], '');
        };
        /**
         * A cursor over the trie: a node (`root`) plus the key `prefix` it represents.
         */
        TrieModel.Traversal = /** @class */ (function () {
            function class_1(root, prefix) {
                this.root = root;
                this.prefix = prefix;
            }
            /**
             * Generator (compiled to a state machine) yielding `{ char, traversal }`
             * for each way of extending the current prefix by one character.
             * A surrogate pair is yielded as a single two-code-unit `char`;
             * sentinel and empty keys of internal nodes are skipped.
             */
            class_1.prototype.children = function () {
                var root, _loop_1, this_1, _i, _a, entry, prefix_1, children, _loop_2, _b, children_1, key;
                return __generator(this, function (_c) {
                    switch (_c.label) {
                        case 0:
                            root = this.root;
                            if (!(root.type == 'internal')) return [3 /*break*/, 5];
                            _loop_1 = function (entry) {
                                var entryNode, internalNode_1, _loop_3, _i, _a, lowSurrogate, fullText, prefix_2, prefix_3;
                                return __generator(this, function (_b) {
                                    switch (_b.label) {
                                        case 0:
                                            entryNode = root.children[entry];
                                            if (!models.isHighSurrogate(entry)) return [3 /*break*/, 8];
                                            if (!(entryNode.type == 'internal')) return [3 /*break*/, 5];
                                            internalNode_1 = entryNode;
                                            _loop_3 = function (lowSurrogate) {
                                                var prefix;
                                                return __generator(this, function (_a) {
                                                    switch (_a.label) {
                                                        case 0:
                                                            prefix = this_1.prefix + entry + lowSurrogate;
                                                            return [4 /*yield*/, {
                                                                    char: entry + lowSurrogate,
                                                                    traversal: function () { return new TrieModel.Traversal(internalNode_1.children[lowSurrogate], prefix); }
                                                                }];
                                                        case 1:
                                                            _a.sent();
                                                            return [2 /*return*/];
                                                    }
                                                });
                                            };
                                            _i = 0, _a = internalNode_1.values;
                                            _b.label = 1;
                                        case 1:
                                            if (!(_i < _a.length)) return [3 /*break*/, 4];
                                            lowSurrogate = _a[_i];
                                            return [5 /*yield**/, _loop_3(lowSurrogate)];
                                        case 2:
                                            _b.sent();
                                            _b.label = 3;
                                        case 3:
                                            _i++;
                                            return [3 /*break*/, 1];
                                        case 4: return [3 /*break*/, 7];
                                        case 5:
                                            fullText = entryNode.entries[0].key;
                                            entry = entry + fullText[this_1.prefix.length + 1]; // The other half of the non-BMP char.
                                            prefix_2 = this_1.prefix + entry;
                                            return [4 /*yield*/, {
                                                    char: entry,
                                                    traversal: function () { return new TrieModel.Traversal(entryNode, prefix_2); }
                                                }];
                                        case 6:
                                            _b.sent();
                                            _b.label = 7;
                                        case 7: return [3 /*break*/, 12];
                                        case 8:
                                            if (!models.isSentinel(entry)) return [3 /*break*/, 9];
                                            return [2 /*return*/, "continue"];
                                        case 9:
                                            if (!!entry) return [3 /*break*/, 10];
                                            return [2 /*return*/, "continue"];
                                        case 10:
                                            prefix_3 = this_1.prefix + entry;
                                            return [4 /*yield*/, {
                                                    char: entry,
                                                    traversal: function () { return new TrieModel.Traversal(entryNode, prefix_3); }
                                                }];
                                        case 11:
                                            _b.sent();
                                            _b.label = 12;
                                        case 12: return [2 /*return*/];
                                    }
                                });
                            };
                            this_1 = this;
                            _i = 0, _a = root.values;
                            _c.label = 1;
                        case 1:
                            if (!(_i < _a.length)) return [3 /*break*/, 4];
                            entry = _a[_i];
                            return [5 /*yield**/, _loop_1(entry)];
                        case 2:
                            _c.sent();
                            _c.label = 3;
                        case 3:
                            _i++;
                            return [3 /*break*/, 1];
                        case 4: return [2 /*return*/];
                        case 5:
                            prefix_1 = this.prefix;
                            children = root.entries.filter(function (entry) {
                                return entry.key != prefix_1 && prefix_1.length < entry.key.length;
                            });
                            _loop_2 = function (key) {
                                var nodeKey;
                                return __generator(this, function (_a) {
                                    switch (_a.label) {
                                        case 0:
                                            nodeKey = key[prefix_1.length];
                                            if (models.isHighSurrogate(nodeKey)) {
                                                // Merge the other half of an SMP char in!
                                                nodeKey = nodeKey + key[prefix_1.length + 1];
                                            }
                                            return [4 /*yield*/, {
                                                    char: nodeKey,
                                                    traversal: function () { return new TrieModel.Traversal(root, prefix_1 + nodeKey); }
                                                }];
                                        case 1:
                                            _a.sent();
                                            return [2 /*return*/];
                                    }
                                });
                            };
                            _b = 0, children_1 = children;
                            _c.label = 6;
                        case 6:
                            if (!(_b < children_1.length)) return [3 /*break*/, 9];
                            key = children_1[_b].key;
                            return [5 /*yield**/, _loop_2(key)];
                        case 7:
                            _c.sent();
                            _c.label = 8;
                        case 8:
                            _b++;
                            return [3 /*break*/, 6];
                        case 9:
                            ;
                            return [2 /*return*/];
                    }
                });
            };
            Object.defineProperty(class_1.prototype, "entries", {
                /**
                 * The `content` of every entry whose key equals the current prefix.
                 * For a leaf these are filtered from the leaf's entries; for an internal
                 * node they are read from the leaf stored under the sentinel child, if any.
                 */
                get: function () {
                    if (this.root.type == 'leaf') {
                        var prefix_4 = this.prefix;
                        var matches = this.root.entries.filter(function (entry) {
                            return entry.key == prefix_4;
                        });
                        return matches.map(function (value) { return value.content; });
                    }
                    else {
                        var matchingLeaf = this.root.children[models.SENTINEL_CODE_UNIT];
                        if (matchingLeaf && matchingLeaf.type == 'leaf') {
                            return matchingLeaf.entries.map(function (value) { return value.content; });
                        }
                        else {
                            return [];
                        }
                    }
                },
                enumerable: true,
                configurable: true
            });
            return class_1;
        }());
        return TrieModel;
    }());
    models.TrieModel = TrieModel;
    ;
    /**
     * Wrapper class for the trie and its nodes.
     */
    var Trie = /** @class */ (function () {
        function Trie(root, totalWeight, wordform2key) {
            this.root = root;
            this.toKey = wordform2key;
            this.totalWeight = totalWeight;
        }
        /**
         * Lookups an arbitrary prefix (a query) in the trie. Returns up to
         * MAX_SUGGESTIONS results in sorted order.
         *
         * @param prefix
         */
        Trie.prototype.lookup = function (prefix) {
            var searchKey = this.toKey(prefix);
            var lowestCommonNode = findPrefix(this.root, searchKey);
            if (lowestCommonNode === null) {
                return [];
            }
            return getSortedResults(lowestCommonNode, searchKey, this.totalWeight);
        };
        /**
         * Returns the top N suggestions from the trie.
         * @param n How many suggestions, maximum, to return.
         */
        Trie.prototype.firstN = function (n) {
            return getSortedResults(this.root, '', this.totalWeight, n);
        };
        return Trie;
    }());
    /**
     * Finds the deepest descendent in the trie with the given prefix key.
     *
     * This means that a search in the trie for a given prefix has a best-case
     * complexity of O(m) where m is the length of the prefix.
     *
     * @param node  The node to start searching from.
     * @param key   The prefix to search for.
     * @param index The index in the prefix. Initially 0.
     * @returns The matching node, or `null` if no child continues the key.
     */
    function findPrefix(node, key, index) {
        if (index === void 0) { index = 0; }
        // An important note - the Trie itself is built on a per-JS-character basis,
        // not on a UTF-8 character-code basis.
        if (node.type === 'leaf' || index === key.length) {
            return node;
        }
        // So, for SMP models, we need to match each char of the supplementary pair
        // in sequence.  Each has its own node in the Trie.
        var char = key[index];
        if (node.children[char]) {
            return findPrefix(node.children[char], key, index + 1);
        }
        return null;
    }
    /**
     * Returns all entries matching the given prefix, in descending order of
     * weight.
     *
     * @param node   the node to collect results from.
     * @param prefix the prefix to match (only checked against entries when `node` is a leaf).
     * @param N      the divisor used to turn an entry's weight into `p`.
     * @param limit  the maximum number of results; defaults to MAX_SUGGESTIONS.
     * @returns an array of `{ text, p }` objects.
     */
    function getSortedResults(node, prefix, N, limit) {
        if (limit === void 0) { limit = MAX_SUGGESTIONS; }
        var queue = new models.PriorityQueue(function (a, b) {
            // In case of Trie compilation issues that emit `null` or `undefined`
            return (b ? b.weight : 0) - (a ? a.weight : 0);
        });
        var results = [];
        if (node.type === 'leaf') {
            // Assuming the values are sorted, we can just add all of the values in the
            // leaf, until we reach the limit.
            for (var _i = 0, _a = node.entries; _i < _a.length; _i++) {
                var item = _a[_i];
                if (item.key.startsWith(prefix)) {
                    var content = item.content, weight = item.weight;
                    results.push({
                        text: content,
                        p: weight / N
                    });
                    if (results.length >= limit) {
                        return results;
                    }
                }
            }
        }
        else {
            queue.enqueue(node);
            var next = void 0;
            var _loop_4 = function () {
                if (isNode(next)) {
                    // When a node is next up in the queue, that means that next least
                    // likely suggestion is among its descendants.
                    // So we search all of its descendants!
                    if (next.type === 'leaf') {
                        queue.enqueueAll(next.entries);
                    }
                    else {
                        // XXX: alias `next` so that TypeScript can be SURE that internal is
                        // in fact an internal node. Because of the callback binding to the
                        // original definition of node (i.e., a Node | Entry), this will not
                        // type-check otherwise.
                        var internal_1 = next;
                        queue.enqueueAll(next.values.map(function (char) {
                            return internal_1.children[char];
                        }));
                    }
                }
                else {
                    // When an entry is up next in the queue, we just add its contents to
                    // the results!
                    results.push({
                        text: next.content,
                        p: next.weight / N
                    });
                    if (results.length >= limit) {
                        return { value: results };
                    }
                }
            };
            while (next = queue.dequeue()) {
                var state_1 = _loop_4();
                if (typeof state_1 === "object")
                    return state_1.value;
            }
        }
        return results;
    }
    /** TypeScript type guard that returns whether the thing is a Node. */
    function isNode(x) {
        return 'type' in x;
    }
    /**
     * Converts wordforms into an indexable form. It does this by
     * normalizing into NFD, removing diacritics, and then converting
     * the result to lowercase.
     *
     * This is a very naïve implementation, that I only think will work on
     * some languages that use the Latin script. As of 2020-04-08, only
     * 4 out of 11 (36%) of published language models use the Latin script,
     * so this might not actually be a great default.
     *
     * This uses String.prototype.normalize() to convert normalize into NFD.
     * NFD is an easy way to separate a Latin character from its diacritics;
     * Even then, some Latin-based orthographies use code points that,
     * under NFD normalization, do NOT decompose into an ASCII letter and a
     * combining diacritical mark (e.g., SENĆOŦEN).
     *
     * Use this only in early iterations of the model. For a production lexical
     * model, you SHOULD write/generate your own key function, tailored to your
     * language.
     */
    function defaultSearchTermToKey(wordform) {
        /**
         * N.B.: this is (slightly) DIFFERENT than the version in
         * keymanapp/lexical-model-compiler/build-trie
         * as this is for compatibility for models built
         * BEFORE the searchTermToKey function was bundled with
         * all models.
         *
         * This compatibility version lowercases AFTER removing diacritics;
         * the new version (bundled in future models) lowercases,
         * NFD normalizes, THEN removes diacritics.
         */
        return wordform
            .normalize('NFD')
            // Remove all combining diacritics (if input is in NFD)
            // common to Latin-orthographies.
            .replace(/[\u0300-\u036f]/g, '')
            .toLowerCase();
    }
    /**
     * Returns the `default` word breaker, taken from a global `wordBreakers`
     * namespace when one exists and otherwise loaded through `require`.
     */
    function getDefaultWordBreaker() {
        var namespace;
        // @ts-ignore
        if (typeof wordBreakers !== 'undefined') {
            // @ts-ignore
            namespace = wordBreakers;
        }
        else {
            namespace = require('@keymanapp/models-wordBreakers').wordBreakers;
        }
        return namespace['default'];
    }
})(models || (models = {}));
/// <reference path="./trie-model.ts" />
// Add all namespaces defined here to the global scope:
if (typeof module != 'undefined' && typeof module.exports != 'undefined') {
    module.exports['models'] = models;
}
//# sourceMappingURL=index.js.map