@keymanapp/kmc-model
Version:
Keyman Developer lexical model compiler
382 lines (380 loc) • 14.8 kB
JavaScript
// Minified, self-invoking snippet that registers a debug id for this file
// (presumably injected by Sentry build tooling, judging by the
// `_sentryDebugIds` property name -- TODO confirm).
// It picks the first available global object (window, global, self, or a
// throwaway object), captures the stack string of a fresh Error, and stores
// the debug id in `_sentryDebugIds` keyed by that stack string. Any exception
// raised while doing so is caught and ignored.
!function(){try{var e="undefined"!=typeof window?window:"undefined"!=typeof global?global:"undefined"!=typeof self?self:{},n=(new Error).stack;n&&(e._sentryDebugIds=e._sentryDebugIds||{},e._sentryDebugIds[n]="5d538b78-2f2f-58ad-9309-85d4bf5b2e1e")}catch(e){}}();
import { ModelCompilerError, ModelCompilerMessageContext, ModelCompilerMessages } from "./model-compiler-messages.js";
import { callbacks } from "./compiler-callbacks.js";
// Separator used to split word list text into lines: an optional CR (U+000D)
// followed by LF (U+000A), so both LF and CRLF line terminators are supported.
// A CR that is not followed by LF does not end a line.
const NEWLINE_SEPARATOR = /\u000d?\u000a/;
/**
 * Compiles word list files into the serialized data structure that can be
 * loaded by the TrieModel.
 *
 * The structure is a **weighted** trie whose indices (paths down the trie) are
 * generated by a search key rather than by the concrete wordforms themselves.
 *
 * @param filenames an array of word list filenames; all of them are merged
 *                  into one word list before the trie is built.
 * @param searchTermToKey function that converts a wordform into its search key.
 * @returns the trie object (`totalWeight` and `root`) as a JSON string.
 * @throws ModelCompilerError if `searchTermToKey` is not a function.
 */
export function createTrieDataStructure(filenames, searchTermToKey) {
    const hasKeyFunction = typeof searchTermToKey === "function";
    if (!hasKeyFunction) {
        throw new ModelCompilerError(ModelCompilerMessages.Error_SearchTermToKeyMustBeExplicitlySpecified());
    }
    // Every file contributes to the same word list, so counts for a word that
    // appears in several files are accumulated.
    const combinedWordlist = {};
    filenames.forEach((filename) => {
        parseWordListFromFilename(combinedWordlist, filename);
    });
    return JSON.stringify(Trie.buildTrie(combinedWordlist, searchTermToKey));
}
/**
 * Reads a word list file and merges its entries (including duplicates of
 * entries already present) into `wordlist`.
 *
 * Accepted file encodings:
 *
 *  - UTF-8, with or without BOM [exported by most software]
 *  - UTF-16, little endian, with BOM [exported by Microsoft Excel]
 *
 * @param wordlist word list to merge entries into (may have existing entries)
 * @param filename filename of the word list
 */
export function parseWordListFromFilename(wordlist, filename) {
    // Record the file being processed in the shared message context before
    // any of its lines are parsed.
    ModelCompilerMessageContext.filename = filename;
    const source = new WordListFromFilename(filename);
    _parseWordList(wordlist, source);
}
/**
 * Merges a word list held in a string into `wordlist`. The string is expected
 * to contain multiple lines separated by LF or CRLF line terminators.
 *
 * @param wordlist word list to merge entries into (may have existing entries)
 * @param contents text of the word list
 */
export function parseWordListFromContents(wordlist, contents) {
    // There is no file behind an in-memory word list, so the filename in the
    // shared message context is cleared.
    ModelCompilerMessageContext.filename = undefined;
    const source = new WordListFromMemory(contents);
    _parseWordList(wordlist, source);
}
/**
 * Reads the lines of a tab-separated values word list into `wordlist`. All
 * wordforms are converted to NFC and duplicate entries are merged by adding
 * their counts together. Duplication is on the basis of
 * character-for-character equality after normalisation to NFC.
 *
 * Format specification, per line yielded by `source.lines()`:
 *
 *  - a leading byte-order mark (U+FEFF) and surrounding whitespace are
 *    removed before the line is examined.
 *  - a line is either empty, a comment, or an entry.
 *  - comment lines start with the '#' character.
 *  - entries are one to three columns, separated by the (horizontal) tab
 *    character.
 *  - column 1 (REQUIRED): the wordform. Surrounding whitespace characters are
 *    trimmed.
 *  - column 2 (optional): the count: a non-negative integer specifying how
 *    many times this entry has appeared in the corpus; commas are permissible
 *    in the digits. A blank, unparsable or negative count is treated as 1.
 *  - column 3 (optional): comment: an informative comment, ignored by the tool.
 *
 * A hint is reported through `callbacks.reportMessage` when a wordform was not
 * already in NFC, and when a wordform appears more than once in this source.
 *
 * @param wordlist word list to merge entries into (may have existing entries)
 * @param source object whose `lines()` method yields `[lineno, line]` pairs
 */
function _parseWordList(wordlist, source) {
    const TAB = "\t";
    const wordsSeenInThisFile = new Set();
    for (const [lineno, rawLine] of source.lines()) {
        ModelCompilerMessageContext.line = lineno;
        // Remove the byte-order mark (BOM) from the beginning of the string.
        // Because the text can be the concatenation of several files, the BOM
        // is removed from every possible start of file -- i.e., the beginning
        // of every line.
        const line = rawLine.replace(/^\uFEFF/, '').trim();
        if (line === "" || line.startsWith('#')) {
            continue; // skip comments and empty lines
        }
        // The third column is the comment. Always ignored!
        const [rawWordform, rawCount] = line.split(TAB);
        // Clean the word form.
        const normalized = rawWordform.normalize('NFC');
        if (normalized !== rawWordform) {
            // Mixed normalization forms may result in unexpected counts where
            // one word appears in multiple normalization forms, so hint
            // about it.
            callbacks.reportMessage(ModelCompilerMessages.Hint_MixedNormalizationForms({ wordform: normalized }));
        }
        const wordform = normalized.trim();
        const countText = (rawCount || '').trim().replace(/,/g, '');
        let count = parseInt(countText, 10);
        // When parsing a decimal integer fails (e.g., blank or something else):
        if (!isFinite(count) || count < 0) {
            // TODO: is this the right thing to do?
            // Treat it like a hapax legomenon -- it exists, but only once.
            count = 1;
        }
        if (wordsSeenInThisFile.has(wordform)) {
            // The same word seen across multiple files is fine, but a word seen
            // multiple times in one file may be a problem.
            callbacks.reportMessage(ModelCompilerMessages.Hint_DuplicateWordInSameFile({ wordform: wordform }));
        }
        wordsSeenInThisFile.add(wordform);
        // Anything that is not a usable number (no entry yet, or a value
        // inherited from Object.prototype such as `constructor`) counts as 0.
        const previous = wordlist[wordform];
        const total = (isNaN(previous) ? 0 : previous || 0) + count;
        // Define the total as an own data property instead of assigning it:
        // on a plain object, assigning a number to `__proto__` goes to the
        // inherited setter and is ignored, which would drop that wordform.
        Object.defineProperty(wordlist, wordform, {
            value: total,
            writable: true,
            enumerable: true,
            configurable: true,
        });
    }
}
/**
 * Word list source backed by text that is already in memory.
 */
class WordListFromMemory {
    // Placeholder name, since there is no file behind this source.
    name = '<memory>';
    _contents;
    constructor(contents) {
        this._contents = contents;
    }
    /**
     * Yields [lineno, line] pairs for the stored text, split on LF or CRLF.
     */
    *lines() {
        const rows = this._contents.split(NEWLINE_SEPARATOR);
        yield* enumerateLines(rows);
    }
}
/**
 * Word list source backed by a file, which is loaded through
 * `callbacks.loadFile` when the lines are first iterated.
 */
class WordListFromFilename {
    name;
    constructor(filename) {
        this.name = filename;
    }
    /**
     * Loads and decodes the file, then yields [lineno, line] pairs split on
     * LF or CRLF.
     *
     * @throws ModelCompilerError if the file could not be loaded.
     */
    *lines() {
        const bytes = callbacks.loadFile(this.name);
        if (!bytes) {
            throw new ModelCompilerError(ModelCompilerMessages.Error_WordlistFileNotFound({ filename: this.name }));
        }
        // The encoding is chosen from the first bytes of the file.
        const encoding = detectEncoding(bytes);
        const decoder = new TextDecoder(encoding);
        const text = decoder.decode(bytes);
        yield* enumerateLines(text.split(NEWLINE_SEPARATOR));
    }
}
/**
 * Numbers the given lines starting from 1, yielding a [lineno, line] pair for
 * each one in order.
 */
function* enumerateLines(lines) {
    let lineno = 0;
    for (const text of lines) {
        lineno += 1;
        yield [lineno, text];
    }
}
var Trie;
(function (TrieNamespace) {
    /**
     * Sentinel key for the "internal" leaf of an internal node. When an
     * internal node itself has content, the entries are not stored on the
     * internal node; a "fake" leaf child is created instead, and it is stored
     * under this key.
     *
     * The value is a valid Unicode BMP code point, but it is a
     * "non-character". Unicode will never assign semantics to these
     * characters, as they are intended to be used internally as sentinel
     * values.
     */
    const INTERNAL_VALUE = '\uFDD0';
    /**
     * Builds a trie from a word list.
     *
     * @param wordlist The wordlist with non-negative weights.
     * @param keyFunction Function that converts word forms into indexed search keys
     * @returns A JSON-serializable object that can be given to the TrieModel constructor.
     */
    function buildTrie(wordlist, keyFunction) {
        const builder = new TrieBuilder(keyFunction);
        builder.buildFromWordList(wordlist);
        const { root } = builder;
        return {
            totalWeight: sumWeights(root),
            root,
        };
    }
    TrieNamespace.buildTrie = buildTrie;
    /**
     * Holds the root node of the trie together with the function that turns a
     * wordform into its search key.
     */
    class TrieBuilder {
        root = createLeaf();
        toKey;
        constructor(wordform2key) {
            this.toKey = wordform2key;
        }
        /**
         * Populates the trie with the contents of an entire wordlist, then
         * sorts it.
         * @param words an object mapping each wordform to its count.
         */
        buildFromWordList(words) {
            Object.entries(words).forEach(([wordform, weight]) => {
                const entry = { key: this.toKey(wordform), weight, content: wordform };
                addUnsorted(this.root, entry, 0);
            });
            sortTrie(this.root);
            return this;
        }
    }
    /**
     * Creates an empty leaf node. Every node, including the root, starts out
     * as one of these.
     */
    function createLeaf() {
        return {
            type: 'leaf',
            weight: 0,
            entries: []
        };
    }
    /**
     * Returns the child of an internal node stored under `char`, creating an
     * empty leaf (and registering `char` in the node's values) if there is
     * none yet.
     */
    function ensureChild(parent, char) {
        if (!parent.children[char]) {
            parent.children[char] = createLeaf();
            parent.values.push(char);
        }
        return parent.children[char];
    }
    /**
     * Adds an entry to the trie.
     *
     * Note that the trie will likely be unsorted after the add occurs. Before
     * performing a lookup on the trie, call sortTrie() on the root node!
     *
     * @param node Which node should the entry be added to?
     * @param entry the wordform/weight/key to add to the trie
     * @param index the index in the key and also the trie depth. Should be set to
     *              zero when adding onto the root node of the trie.
     */
    function addUnsorted(node, entry, index = 0) {
        // Each node stores the MAXIMUM weight out of all of its descendants,
        // to enable a greedy search through the trie.
        node.weight = Math.max(node.weight, entry.weight);
        // A leaf becomes an interior node when it already holds a value and
        // the key of the entry being added continues past this depth.
        const mustSplitLeaf = node.type === 'leaf'
            && index < entry.key.length
            && node.entries.length >= 1;
        if (mustSplitLeaf) {
            convertLeafToInternalNode(node, index);
        }
        if (node.type === 'leaf') {
            // The key matches this leaf node, so add yet another entry.
            node.entries.push(entry);
        }
        else {
            // Push the entry down to a lower node.
            addItemToInternalNode(node, entry, index);
        }
        node.unsorted = true;
    }
    /**
     * Adds an item below the internal node at a given depth.
     * @param node the internal node
     * @param item the entry to add
     * @param index the depth of `node`, i.e. the position in the item's key
     */
    function addItemToInternalNode(node, item, index) {
        // If the key ends here, the internal node is the proper site for the
        // item, so it belongs under the sentinel (internal-use) child.
        const char = item.key[index] ?? INTERNAL_VALUE;
        addUnsorted(ensureChild(node, char), item, index + 1);
    }
    /**
     * Mutates the given Leaf to turn it into an InternalNode.
     *
     * NOTE: the node passed in will be DESTRUCTIVELY CHANGED into a different
     * type when passed into this function!
     *
     * @param leaf the leaf node to convert in place
     * @param depth depth of the trie at this level.
     */
    function convertLeafToInternalNode(leaf, depth) {
        const formerEntries = leaf.entries;
        leaf.type = 'internal';
        delete leaf.entries;
        leaf.values = [];
        leaf.children = {};
        // Redistribute the old entries among the children of the new
        // interior node.
        formerEntries.forEach((item) => {
            const char = depth < item.key.length ? item.key[depth] : INTERNAL_VALUE;
            addUnsorted(ensureChild(leaf, char), item, depth + 1);
        });
        leaf.unsorted = true;
    }
    /**
     * Recursively sort the trie, in descending order of weight.
     * @param node any node in the trie
     */
    function sortTrie(node) {
        const isLeaf = node.type === 'leaf';
        if (!isLeaf) {
            // Children MUST be visited even if this node is already sorted.
            node.values.forEach((char) => sortTrie(node.children[char]));
        }
        if (!node.unsorted) {
            return;
        }
        if (isLeaf) {
            node.entries.sort((a, b) => b.weight - a.weight);
        }
        else {
            node.values.sort((a, b) => node.children[b].weight - node.children[a].weight);
        }
        delete node.unsorted;
    }
    /**
     * O(n) recursive traversal to sum the total weight of all leaves in the
     * trie, starting at the provided node.
     *
     * @param node The node to start summing weights.
     * @throws Error if the sum is NaN.
     */
    function sumWeights(node) {
        let total = 0;
        if (node.type === 'leaf') {
            for (const entry of node.entries) {
                total = total + entry.weight;
            }
        }
        else {
            for (const char of Object.keys(node.children)) {
                total = total + sumWeights(node.children[char]);
            }
        }
        if (isNaN(total)) {
            throw new Error("Unexpected NaN has appeared!");
        }
        return total;
    }
})(Trie || (Trie = {}));
/**
 * Detects the encoding of a text file from its first two bytes.
 *
 * Supported encodings are:
 *
 *  - UTF-8, with or without BOM
 *  - UTF-16, little endian, with BOM
 *
 * UTF-16 in big endian is explicitly NOT supported! The original author gives
 * two reasons: 1) Node was believed not to support it without resorting to an
 * external library (or swapping every byte in the file!); and 2) it was not
 * clear that anything actually outputs in this format anyway.
 *
 * @param buffer the bytes of the file whose encoding should be detected
 * @returns 'utf-16le' if the data starts with the little-endian BOM,
 *          otherwise 'utf-8'
 * @throws ModelCompilerError if the data starts with the big-endian BOM
 */
function detectEncoding(buffer) {
    // Too short to hold a UTF-16 BOM.
    if (buffer.length < 2) {
        return 'utf-8';
    }
    const first = buffer[0];
    const second = buffer[1];
    // Note: BOM is U+FEFF. In little endian, this is 0xFF 0xFE.
    if (first == 0xFF && second == 0xFE) {
        return 'utf-16le';
    }
    // In big endian the BOM is 0xFE 0xFF, which is rejected.
    // See: https://stackoverflow.com/a/14551669/6626414
    if (first == 0xFE && second == 0xFF) {
        throw new ModelCompilerError(ModelCompilerMessages.Error_UTF16BEUnsupported());
    }
    // Assume it is in UTF-8, with or without a BOM.
    return 'utf-8';
}
//# sourceMappingURL=build-trie.js.map
//# debugId=5d538b78-2f2f-58ad-9309-85d4bf5b2e1e