/*
 * @predictive-text-studio/models-templates
 * Version:
 * Backing model implementations (templates) for Keyman's modelling layer
 * 733 lines • 34.3 kB
 * JavaScript
 */
var models;
(function (models) {
models.SENTINEL_CODE_UNIT = '\uFDD0';
/**
 * Produces the context that results from applying a transform to `context`.
 *
 * Up to `transform.deleteLeft` units are removed from the end of the left
 * context and up to `transform.deleteRight` units from the start of the right
 * context (each clamped to the available text); `transform.insert` is then
 * appended to the left context. Neither argument is mutated.
 *
 * @param transform The edit to apply (`insert`, `deleteLeft`, optional `deleteRight`).
 * @param context The current context (`left`, `right`, `startOfBuffer`, `endOfBuffer`).
 * @returns A new context object; the buffer-boundary flags are copied through unchanged.
 */
function applyTransform(transform, context) {
    // Left side: trim the deleted tail, then append the inserted text.
    var before = context.left || '';
    var removedBefore = Math.min(before.length, transform.deleteLeft);
    var keptBefore = before.substring(0, before.length - removedBefore);
    var insertedText = transform.insert || '';
    // Right side: drop the deleted head.
    var after = context.right || '';
    var removedAfter = Math.min(after.length, transform.deleteRight);
    return {
        left: keptBefore + insertedText,
        right: after.substring(removedAfter),
        startOfBuffer: context.startOfBuffer,
        endOfBuffer: context.endOfBuffer
    };
}
models.applyTransform = applyTransform;
/**
 * Merges one transform into another, mutating the first parameter so that it
 * also includes the effects of the second.
 *
 * @param transform The transform to extend in place.
 * @param prefix The transform whose insert text is placed in front of
 *               `transform.insert` and whose deletion counts are added on.
 */
function prependTransform(transform, prefix) {
    var extraDeleteRight = prefix.deleteRight;
    transform.insert = prefix.insert + transform.insert;
    transform.deleteLeft = transform.deleteLeft + prefix.deleteLeft;
    if (!extraDeleteRight) {
        // Nothing to add on the right; leave `deleteRight` exactly as it was.
        return;
    }
    transform.deleteRight = (transform.deleteRight || 0) + extraDeleteRight;
}
models.prependTransform = prependTransform;
/**
 * Reports whether a UTF-16 code unit is a high (leading) surrogate,
 * i.e. lies in the range 0xD800-0xDBFF.
 *
 * @param codeUnit Either a numeric code unit, or a string whose first
 *                 code unit is examined.
 * @returns `true` when the code unit is a high surrogate.
 */
function isHighSurrogate(codeUnit) {
    var value = typeof codeUnit == 'string' ? codeUnit.charCodeAt(0) : codeUnit;
    return 0xD800 <= value && value <= 0xDBFF;
}
models.isHighSurrogate = isHighSurrogate;
/**
 * Reports whether `char` is the sentinel code unit (`models.SENTINEL_CODE_UNIT`).
 *
 * @param char The value to compare against the sentinel.
 * @returns `true` when `char` loosely equals the sentinel.
 */
function isSentinel(char) {
    var sentinel = models.SENTINEL_CODE_UNIT;
    return sentinel == char;
}
models.isSentinel = isSentinel;
})(models || (models = {}));
/**
* @file priority-queue.ts
*
* Defines a mildly abstracted priority queue implementation.
*/
var models;
(function (models) {
var PriorityQueue = /** @class */ (function () {
    /**
     * Constructs a priority queue backed by a binary heap stored in an array.
     * @param comparator A `Comparator` returning negative values when and only when
     * the first parameter should precede the second parameter.
     * @param initialEntries Optional entries to seed the queue with. They are
     * copied into the queue's own array and then heapified.
     */
    function PriorityQueue(comparator, initialEntries) {
        if (initialEntries === void 0) { initialEntries = []; }
        // TODO: We may wish to allow options specifying a limit or threshold for adding
        // items to the priority queue. Possibly both.
        //
        // When that time comes, consider a min-max heap.
        // https://en.wikipedia.org/wiki/Min-max_heap
        this.comparator = comparator;
        this.heap = Array.from(initialEntries);
        this.heapify();
    }
    /** Exchanges two slots of the backing array. */
    function swap(heap, i, j) {
        var held = heap[i];
        heap[i] = heap[j];
        heap[j] = held;
    }
    /** Index of the left child of the heap node at `index`. */
    PriorityQueue.leftChildIndex = function (index) {
        return index * 2 + 1;
    };
    /** Index of the right child of the heap node at `index`. */
    PriorityQueue.rightChildIndex = function (index) {
        return index * 2 + 2;
    };
    /** Index of the parent of the heap node at `index`; yields -1 for the root (index 0). */
    PriorityQueue.parentIndex = function (index) {
        return Math.floor((index - 1) / 2);
    };
    /**
     * Restores the heap property after the nodes in `[start, end]` may have
     * gained new children. Called without arguments, it rebuilds the whole heap.
     *
     * @param start First index (inclusive) whose subtree must be re-examined.
     * @param end   Last index (inclusive) whose subtree must be re-examined.
     */
    PriorityQueue.prototype.heapify = function (start, end) {
        if (start == undefined || end == undefined) {
            // No range given: rebuild everything, then stop. Without this early
            // exit the code below would run with undefined bounds.
            this.heapify(0, this.count - 1);
            return;
        }
        // Use of 'indices' here is a bit of a customization.
        // At the cost of (temporary) extra storage space, we can more efficiently enqueue
        // multiple elements simultaneously.
        var queuedIndices = [];
        var lastParent = -1;
        // Pass 1: sift every node in the range, walking from the back so that
        // children are settled before their parents within the range.
        for (var i = end; i >= start; i--) {
            var parent = PriorityQueue.parentIndex(i);
            // A changed node's parent needs a look only if it exists (>= 0), lies
            // outside the range handled by this loop, and is not already queued.
            if (this.siftDown(i) && parent >= 0 && parent < start && lastParent != parent) {
                queuedIndices.push(parent);
                lastParent = parent;
            }
        }
        // Pass 2: propagate changes towards the root. A read cursor is used
        // instead of Array#shift so draining the work list stays linear.
        lastParent = -1;
        for (var cursor = 0; cursor < queuedIndices.length; cursor++) {
            var index = queuedIndices[cursor];
            var ancestor = PriorityQueue.parentIndex(index);
            // We only need to queue examination for a heap node if its children have changed.
            if (this.siftDown(index) && ancestor >= 0 && lastParent != ancestor) {
                queuedIndices.push(ancestor);
                lastParent = ancestor;
            }
        }
    };
    Object.defineProperty(PriorityQueue.prototype, "count", {
        /**
         * Returns the number of elements currently held by the priority queue.
         */
        get: function () {
            return this.heap.length;
        },
        enumerable: true,
        configurable: true
    });
    /**
     * Returns the highest-priority item within the priority queue without
     * removing it, or `undefined` when the queue is empty.
     * <p>
     * Is O(1).
     */
    PriorityQueue.prototype.peek = function () {
        return this.heap[0];
    };
    /**
     * Inserts a new element into the priority queue, placing it in order.
     * <p>
     * Is O(log N), where N = # of items in the priority queue.
     * @param element The element to insert.
     */
    PriorityQueue.prototype.enqueue = function (element) {
        var heap = this.heap;
        var index = heap.length;
        heap.push(element);
        // Bubble the new element up while it outranks its parent.
        while (index !== 0) {
            var parentIndex = PriorityQueue.parentIndex(index);
            if (!(this.comparator(heap[index], heap[parentIndex]) < 0)) {
                break;
            }
            swap(heap, index, parentIndex);
            index = parentIndex;
        }
    };
    /**
     * Efficiently batch-enqueues multiple elements.
     * Worst-case is the _better_ of the following:
     * - O(`elements.count` + `heap.count`) - large element counts will trigger in-place
     *   heap reconstruction.
     * - O(`elements.count` * log(`heap.count`)) - logarithmic when elements.count << heap.count
     * @param elements A group of elements to enqueue simultaneously.
     */
    PriorityQueue.prototype.enqueueAll = function (elements) {
        if (elements.length == 0) {
            return;
        }
        var firstIndex = this.count;
        this.heap = this.heap.concat(elements);
        var firstParent = PriorityQueue.parentIndex(firstIndex);
        // Only the parents of the appended elements can violate the heap property.
        // The 'parent' of index 0 will return -1, which is illegal, so clamp to 0.
        this.heapify(firstParent >= 0 ? firstParent : 0, PriorityQueue.parentIndex(this.count - 1));
    };
    /**
     * Removes the highest-priority element from the queue, returning it
     * (or `undefined` when the queue is empty).
     * <p>
     * Is O(log N), where N = number of items in the priority queue.
     */
    PriorityQueue.prototype.dequeue = function () {
        if (this.count == 0) {
            return undefined;
        }
        var root = this.heap[0];
        // Move the last element to the root, then let it sink into place.
        var tail = this.heap.pop();
        if (this.heap.length > 0) {
            this.heap[0] = tail;
            this.siftDown(0);
        }
        return root;
    };
    /**
     * Compares the entry at the specified index against its children,
     * propagating it downward within the heap until heap requirements are satisfied.
     * <p>
     * Is O(log N), where N = number of items in the priority queue.
     *
     * @param index The index of the top-most node that must be examined
     *              for repositioning.
     * @returns `true` if a swap occurred, `false` otherwise.
     */
    PriorityQueue.prototype.siftDown = function (index) {
        var heap = this.heap;
        var current = index;
        var swapped = false;
        while (true) {
            var leftIndex = PriorityQueue.leftChildIndex(current);
            var rightIndex = PriorityQueue.rightChildIndex(current);
            var topMostIndex = current;
            if (leftIndex < heap.length && this.comparator(heap[leftIndex], heap[topMostIndex]) < 0) {
                topMostIndex = leftIndex;
            }
            if (rightIndex < heap.length && this.comparator(heap[rightIndex], heap[topMostIndex]) < 0) {
                topMostIndex = rightIndex;
            }
            if (topMostIndex == current) {
                return swapped;
            }
            swap(heap, current, topMostIndex);
            swapped = true;
            current = topMostIndex;
        }
    };
    /**
     * Returns an array containing all entries of the priority queue.
     * Altering the returned array will not affect the queue, but mutating
     * the array's elements can cause unintended side effects.
     *
     * This function makes no guarantees on the ordering of the returned elements;
     * they will almost certainly be unsorted.
     */
    PriorityQueue.prototype.toArray = function () {
        return Array.from(this.heap);
    };
    return PriorityQueue;
}());
models.PriorityQueue = PriorityQueue;
})(models || (models = {}));
/*
* Copyright (c) 2019 National Research Council Canada (author: Eddie A. Santos)
* Copyright (c) 2019 SIL International
* Copyright (c) 2015–2017 Conrad Irwin
* Copyright (c) 2011–2015 Marc Campbell
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* this software and associated documentation files (the "Software"), to deal in
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
* COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Runtime helper that drives the switch-based generator bodies used later in this
// file (e.g. `__generator(this, function (_c) { switch (_c.label) { ... } })`).
// NOTE(review): this looks like the standard compiler-emitted `__generator`
// downlevel helper (same shape as tslib's) — confirm before hand-editing.
//
// An already-present `this.__generator` is reused; otherwise the function below
// is installed. It takes the `this` value for the body (`thisArg`) and the
// state-machine function (`body`), and returns an object with `next`, `throw`
// and `return` methods (plus `Symbol.iterator` when `Symbol` is available).
var __generator = (this && this.__generator) || function (thisArg, body) {
// `_` is the state handed to `body`: `label` is the case to resume at, `sent()`
// returns (or throws) the last value passed in, `trys`/`ops` track try regions
// and pending operations. `f` is the re-entrancy flag, `y` a delegated iterator,
// `t` scratch storage, `g` the object returned to the caller.
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
// verb(n) builds the method that forwards its argument to step() as opcode n
// (0 for next, 1 for throw, 2 for return, per the object literal above).
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
// Refuse re-entrant calls while a step is already in progress.
if (f) throw new TypeError("Generator is already executing.");
while (_) try {
// While delegating to an inner iterator `y`, forward the operation to it and
// hand back its result for as long as it is not done.
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
case 0: case 1: t = op; break;
// Opcode 4: advance the label and surface op[1] as a not-done result.
case 4: _.label++; return { value: op[1], done: false };
// Opcode 5: advance the label and start delegating to the iterator in op[1].
case 5: _.label++; y = op[1]; op = [0]; continue;
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// Remaining opcodes (2, 3 and 6 are the ones tested below) are resolved
// against the innermost entry of `_.trys`, if any.
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
// Run the state machine from the current label; it returns the next operation.
op = body.call(thisArg, _);
// An exception thrown by the body becomes opcode 6; the flag and scratch value
// are always cleared on the way out of an iteration.
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// Once the loop ends: opcodes with bit 1 or 4 set rethrow op[1]; anything else
// produces the final `done: true` result.
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
/// <reference path="common.ts" />
/// <reference path="priority-queue.ts" />
/**
* @file trie-model.ts
*
* Defines a simple word list (unigram) model.
*/
var models;
(function (models) {
/** Upper bound on the amount of suggestions to generate. */
var MAX_SUGGESTIONS = 12;
/**
* @class TrieModel
*
* Defines a trie-backed word list model, or the unigram model.
* Unigram models throw away all preceding words, and search
* for the next word exclusively. As such, they can perform simple
* prefix searches within words, however they are not very good
* at predicting the next word.
*/
var TrieModel = /** @class */ (function () {
/**
 * Builds a trie-backed model.
 *
 * @param trieData Compiled trie data; its `root` and `totalWeight` members
 *                 are handed to the `Trie` wrapper.
 * @param options  Optional settings: `searchTermToKey` (key function, falling back to
 *                 `defaultSearchTermToKey`), `wordBreaker` (falling back to the
 *                 default word breaker) and `punctuation` (stored as-is).
 */
function TrieModel(trieData, options) {
    var settings = options === void 0 ? {} : options;
    var root = trieData.root;
    var totalWeight = trieData.totalWeight;
    var toKey = settings.searchTermToKey || defaultSearchTermToKey;
    this._trie = new Trie(root, totalWeight, toKey);
    this.breakWords = settings.wordBreaker || getDefaultWordBreaker();
    this.punctuation = settings.punctuation;
}
/**
 * Records and returns this model's configuration, derived from the
 * capabilities reported by the caller.
 *
 * @param capabilities Object providing `maxLeftContextCodePoints` and
 *                     `maxRightContextCodePoints`.
 * @returns The configuration object, which is also stored on `this.configuration`.
 */
TrieModel.prototype.configure = function (capabilities) {
    var configuration = {
        leftContextCodePoints: capabilities.maxLeftContextCodePoints,
        rightContextCodePoints: capabilities.maxRightContextCodePoints
    };
    this.configuration = configuration;
    return configuration;
};
/**
 * Produces suggestions for the word being typed.
 *
 * @param transform The pending keystroke's edit (`insert`, `deleteLeft`, ...).
 * @param context   The context the transform is about to be applied to.
 * @returns An array of `{ sample, p }` pairs, where `sample` is a suggestion
 *          (`transform`, `displayAs`, `p`) and `p` repeats its probability.
 */
TrieModel.prototype.predict = function (transform, context) {
    /* Helper: pairs every suggestion with its own probability. */
    function makeDistribution(suggestions) {
        return suggestions.map(function (suggestion) {
            return { sample: suggestion, p: suggestion.p };
        });
    }
    // Special case - nothing inserted into a completely empty buffer:
    // simply offer the top suggestions from the trie.
    var isEmptyStart = !transform.insert && context.startOfBuffer && context.endOfBuffer;
    if (isEmptyStart) {
        var topEntries = this._trie.firstN(MAX_SUGGESTIONS);
        return makeDistribution(topEntries.map(function (entry) {
            return {
                transform: {
                    insert: entry.text,
                    deleteLeft: 0
                },
                displayAs: entry.text,
                p: entry.p
            };
        }));
    }
    // What the context will look like once the keystroke has been applied.
    var newContext = models.applyTransform(transform, context);
    // Difference in length introduced by the transform itself
    // (characters deleted minus characters inserted).
    var leftDelOffset = transform.deleteLeft - transform.insert.kmwLength();
    // The word under the cursor, INCLUDING anything that was just typed.
    var prefix = this.getLastWord(newContext.left);
    var matches = this._trie.lookup(prefix);
    return makeDistribution(matches.map(function (entry) {
        return {
            transform: {
                // The suggestion from the trie is inserted verbatim...
                insert: entry.text,
                // ...replacing the prefix the user wrote. A separate
                // capitalization/orthography engine can take this result
                // and transform it as needed.
                deleteLeft: leftDelOffset + prefix.kmwLength()
            },
            displayAs: entry.text,
            p: entry.p
        };
    }));
};
/**
 * Returns the text of the final word produced by this model's word breaker
 * for the given context, or the empty string when it produces no words.
 * @param fullLeftContext the entire left context of the string.
 */
TrieModel.prototype.getLastWord = function (fullLeftContext) {
    var spans = this.breakWords(fullLeftContext);
    return spans.length > 0 ? spans.pop().text : '';
};
/**
 * Returns the last word of the context's left-hand text.
 * @param context A context object; only its `left` member is read.
 */
TrieModel.prototype.wordbreak = function (context) {
    var leftText = context.left;
    return this.getLastWord(leftText);
};
/**
 * Creates a traversal positioned at the trie's root node with an empty prefix.
 */
TrieModel.prototype.traverseFromRoot = function () {
    var Traversal = TrieModel.Traversal;
    var trieRoot = this._trie.root;
    return new Traversal(trieRoot, '');
};
/**
 * A position within the trie: `root` is the node this traversal sits on and
 * `prefix` is the key text associated with that position (both are stored
 * exactly as passed to the constructor).
 *
 * `children()` is a generator compiled down to a `__generator` state machine;
 * the numeric tuples it returns carry the compiler's own annotations
 * (2 = return, 3 = break/jump to label, 4 = yield, 5 = yield*).
 */
TrieModel.Traversal = /** @class */ (function () {
function class_1(root, prefix) {
this.root = root;
this.prefix = prefix;
}
/**
 * Yields one `{ char, traversal }` record per possible next character.
 * `char` is the character (a full surrogate pair where applicable) and
 * `traversal` is a function that builds the child Traversal on demand.
 */
class_1.prototype.children = function () {
var root, _loop_1, this_1, _i, _a, entry, prefix_1, children, _loop_2, _b, children_1, key;
return __generator(this, function (_c) {
switch (_c.label) {
case 0:
root = this.root;
// Leaf nodes are handled from label 5 onwards; internal nodes fall through.
if (!(root.type == 'internal')) return [3 /*break*/, 5];
// Body of the loop over `root.values` for an internal node.
_loop_1 = function (entry) {
var entryNode, internalNode_1, _loop_3, _i, _a, lowSurrogate, fullText, prefix_2, prefix_3;
return __generator(this, function (_b) {
switch (_b.label) {
case 0:
entryNode = root.children[entry];
// Anything that is not a high surrogate is handled from label 8 onwards.
if (!models.isHighSurrogate(entry)) return [3 /*break*/, 8];
// High surrogate whose child is a leaf: handled at label 5.
if (!(entryNode.type == 'internal')) return [3 /*break*/, 5];
internalNode_1 = entryNode;
// High surrogate with an internal child: each of that child's `values`
// (the low surrogates) is combined with `entry` into one yielded character.
_loop_3 = function (lowSurrogate) {
var prefix;
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
prefix = this_1.prefix + entry + lowSurrogate;
return [4 /*yield*/, {
char: entry + lowSurrogate,
traversal: function () { return new TrieModel.Traversal(internalNode_1.children[lowSurrogate], prefix); }
}];
case 1:
_a.sent();
return [2 /*return*/];
}
});
};
_i = 0, _a = internalNode_1.values;
_b.label = 1;
case 1:
if (!(_i < _a.length)) return [3 /*break*/, 4];
lowSurrogate = _a[_i];
return [5 /*yield**/, _loop_3(lowSurrogate)];
case 2:
_b.sent();
_b.label = 3;
case 3:
_i++;
return [3 /*break*/, 1];
case 4: return [3 /*break*/, 7];
case 5:
// High surrogate with a leaf child: recover the second half of the pair from
// the leaf's first entry key.
// NOTE(review): assumes the leaf has at least one entry — confirm.
fullText = entryNode.entries[0].key;
entry = entry + fullText[this_1.prefix.length + 1]; // The other half of the non-BMP char.
prefix_2 = this_1.prefix + entry;
return [4 /*yield*/, {
char: entry,
traversal: function () { return new TrieModel.Traversal(entryNode, prefix_2); }
}];
case 6:
_b.sent();
_b.label = 7;
case 7: return [3 /*break*/, 12];
case 8:
// The sentinel entry is skipped: nothing is yielded for it here.
if (!models.isSentinel(entry)) return [3 /*break*/, 9];
return [2 /*return*/, "continue"];
case 9:
// Falsy (e.g. empty) entries are skipped as well.
if (!!entry) return [3 /*break*/, 10];
return [2 /*return*/, "continue"];
case 10:
// Ordinary single-code-unit entry.
prefix_3 = this_1.prefix + entry;
return [4 /*yield*/, {
char: entry,
traversal: function () { return new TrieModel.Traversal(entryNode, prefix_3); }
}];
case 11:
_b.sent();
_b.label = 12;
case 12: return [2 /*return*/];
}
});
};
this_1 = this;
// Labels 1-4: iterate `root.values`, delegating to `_loop_1` for each entry.
_i = 0, _a = root.values;
_c.label = 1;
case 1:
if (!(_i < _a.length)) return [3 /*break*/, 4];
entry = _a[_i];
return [5 /*yield**/, _loop_1(entry)];
case 2:
_c.sent();
_c.label = 3;
case 3:
_i++;
return [3 /*break*/, 1];
case 4: return [2 /*return*/];
case 5:
// Leaf node: keep only entries whose key is longer than (and different from)
// the current prefix, then yield the character following the prefix in each key.
// NOTE(review): the filter does not check that the key starts with the prefix,
// and entries sharing the same next character each produce their own yield —
// confirm that callers expect this.
prefix_1 = this.prefix;
children = root.entries.filter(function (entry) {
return entry.key != prefix_1 && prefix_1.length < entry.key.length;
});
_loop_2 = function (key) {
var nodeKey;
return __generator(this, function (_a) {
switch (_a.label) {
case 0:
nodeKey = key[prefix_1.length];
if (models.isHighSurrogate(nodeKey)) {
// Merge the other half of an SMP char in!
nodeKey = nodeKey + key[prefix_1.length + 1];
}
// The child traversal stays on the same leaf, with a longer prefix.
return [4 /*yield*/, {
char: nodeKey,
traversal: function () { return new TrieModel.Traversal(root, prefix_1 + nodeKey); }
}];
case 1:
_a.sent();
return [2 /*return*/];
}
});
};
_b = 0, children_1 = children;
_c.label = 6;
case 6:
if (!(_b < children_1.length)) return [3 /*break*/, 9];
key = children_1[_b].key;
return [5 /*yield**/, _loop_2(key)];
case 7:
_c.sent();
_c.label = 8;
case 8:
_b++;
return [3 /*break*/, 6];
case 9:
;
return [2 /*return*/];
}
});
};
/**
 * The `content` of every entry that ends exactly at this position:
 * - leaf node: entries whose key equals the current prefix;
 * - internal node: all entries of the child stored under the sentinel code
 *   unit when that child is a leaf, otherwise an empty array.
 */
Object.defineProperty(class_1.prototype, "entries", {
get: function () {
if (this.root.type == 'leaf') {
var prefix_4 = this.prefix;
var matches = this.root.entries.filter(function (entry) {
return entry.key == prefix_4;
});
return matches.map(function (value) { return value.content; });
}
else {
var matchingLeaf = this.root.children[models.SENTINEL_CODE_UNIT];
if (matchingLeaf && matchingLeaf.type == 'leaf') {
return matchingLeaf.entries.map(function (value) { return value.content; });
}
else {
return [];
}
}
},
enumerable: true,
configurable: true
});
return class_1;
}());
return TrieModel;
}());
models.TrieModel = TrieModel;
;
/**
 * Wrapper class for the trie and its nodes. It bundles the root node, the
 * total weight used to turn weights into probabilities, and the function
 * that converts a wordform into a search key.
 */
var Trie = /** @class */ (function () {
    /**
     * @param root         The root node of the trie.
     * @param totalWeight  Passed along to `getSortedResults` as the weight divisor.
     * @param wordform2key Function converting a wordform into its search key.
     */
    function Trie(root, totalWeight, wordform2key) {
        this.root = root;
        this.toKey = wordform2key;
        this.totalWeight = totalWeight;
    }
    /**
     * Looks up an arbitrary prefix (a query) in the trie and returns the
     * matching results as produced by `getSortedResults` (using its default
     * limit). Returns an empty array when no node matches the prefix.
     *
     * @param prefix The text to search for; it is converted with `toKey` first.
     */
    Trie.prototype.lookup = function (prefix) {
        var key = this.toKey(prefix);
        var startNode = findPrefix(this.root, key);
        return startNode === null
            ? []
            : getSortedResults(startNode, key, this.totalWeight);
    };
    /**
     * Returns the top N suggestions from the trie.
     * @param n How many suggestions, maximum, to return.
     */
    Trie.prototype.firstN = function (n) {
        var wholeTrie = this.root;
        return getSortedResults(wholeTrie, '', this.totalWeight, n);
    };
    return Trie;
}());
/**
 * Finds the deepest descendent in the trie with the given prefix key.
 *
 * The walk consumes one JavaScript character (UTF-16 code unit) of the key
 * per level, so both halves of a surrogate pair are matched in sequence,
 * each against its own node. It stops early when a leaf is reached.
 *
 * @param node  The node to start from.
 * @param key   The prefix to search for.
 * @param index The index in the prefix to start at. Defaults to 0.
 * @returns The node reached, or `null` when some character has no matching child.
 */
function findPrefix(node, key, index) {
    var position = index === void 0 ? 0 : index;
    var current = node;
    while (current.type !== 'leaf' && position !== key.length) {
        var child = current.children[key[position]];
        if (!child) {
            return null;
        }
        current = child;
        position = position + 1;
    }
    return current;
}
/**
 * Collects entries at or below `node`, heaviest first, as
 * `{ text, p }` records where `p` is the entry's weight divided by `N`.
 *
 * @param node   The node to search from.
 * @param prefix The search key; used to filter entries when `node` is a leaf.
 * @param N      The divisor applied to each weight.
 * @param limit  Collection stops once this many results are held.
 *               Defaults to `MAX_SUGGESTIONS`.
 */
function getSortedResults(node, prefix, N, limit) {
    if (limit === void 0) { limit = MAX_SUGGESTIONS; }
    // Missing items (`null`/`undefined` from trie compilation issues) count as weight 0.
    function weightOf(item) {
        return item ? item.weight : 0;
    }
    var queue = new models.PriorityQueue(function (a, b) {
        return weightOf(b) - weightOf(a);
    });
    var results = [];
    if (node.type === 'leaf') {
        // Assuming the leaf's entries are already sorted, take the matching
        // ones in order until the limit is reached.
        var leafEntries = node.entries;
        for (var i = 0; i < leafEntries.length; i++) {
            var item = leafEntries[i];
            if (!item.key.startsWith(prefix)) {
                continue;
            }
            results.push({
                text: item.content,
                p: item.weight / N
            });
            if (results.length >= limit) {
                return results;
            }
        }
        return results;
    }
    // Internal node: best-first search over nodes and entries together.
    queue.enqueue(node);
    var next;
    while (next = queue.dequeue()) {
        if (!isNode(next)) {
            // An entry at the front of the queue goes straight into the results.
            results.push({
                text: next.content,
                p: next.weight / N
            });
            if (results.length >= limit) {
                return results;
            }
            continue;
        }
        // A node at the front of the queue means the next suggestion is among
        // its descendants, so they are all queued.
        if (next.type === 'leaf') {
            queue.enqueueAll(next.entries);
        }
        else {
            var internalNode = next;
            queue.enqueueAll(next.values.map(function (char) {
                return internalNode.children[char];
            }));
        }
    }
    return results;
}
/**
 * Type guard: tells trie nodes apart from entries by checking for the
 * presence of a `type` property on `x`.
 */
function isNode(x) {
    var hasTypeTag = 'type' in x;
    return hasTypeTag;
}
/**
 * Converts wordforms into an indexable form. It does this by
 * normalizing into NFD, removing diacritics, and then converting
 * the result to lowercase.
 *
 * This is a very naive implementation that is only expected to work for
 * some languages that use the Latin script. As of 2020-04-08, only
 * 4 out of 11 (36%) of published language models use the Latin script,
 * so this might not actually be a great default.
 *
 * NFD (via String.prototype.normalize()) is an easy way to separate a Latin
 * character from its diacritics. Even then, some Latin-based orthographies
 * use code points that, under NFD normalization, do NOT decompose into an
 * ASCII letter and a combining diacritical mark (e.g., SENĆOŦEN).
 *
 * Use this only in early iterations of the model. For a production lexical
 * model, you SHOULD write/generate your own key function, tailored to your
 * language.
 *
 * N.B.: this is (slightly) DIFFERENT from the version in
 * keymanapp/lexical-model-compiler/build-trie, as this one exists for
 * compatibility with models built BEFORE the searchTermToKey function was
 * bundled with all models. This compatibility version lowercases AFTER
 * removing diacritics; the new version (bundled in future models)
 * lowercases, NFD normalizes, THEN removes diacritics.
 */
function defaultSearchTermToKey(wordform) {
    var decomposed = wordform.normalize('NFD');
    // Strip the combining diacritics (U+0300-U+036F) common to Latin orthographies.
    var withoutDiacritics = decomposed.replace(/[\u0300-\u036f]/g, '');
    return withoutDiacritics.toLowerCase();
}
/**
 * Returns the `default` word breaker, taken from the global `wordBreakers`
 * namespace when one is defined, and otherwise from the
 * '@keymanapp/models-wordBreakers' module.
 */
function getDefaultWordBreaker() {
    // @ts-ignore
    var hasGlobalNamespace = typeof wordBreakers !== 'undefined';
    var namespace = hasGlobalNamespace
        // @ts-ignore
        ? wordBreakers
        : require('@keymanapp/models-wordBreakers').wordBreakers;
    return namespace['default'];
}
})(models || (models = {}));
/// <reference path="./trie-model.ts" />
// Add all namespaces defined here to the global scope:
if (typeof module != 'undefined' && typeof module.exports != 'undefined') {
module.exports['models'] = models;
}
//# sourceMappingURL=index.js.map