UNPKG

@keymanapp/kmc-model

Version:

Keyman Developer lexical model compiler

github.com/keymanapp/keyman

keymanapp/keyman

382 lines (380 loc) • 14.8 kB

JavaScript

!function(){try{var e="undefined"!=typeof window?window:"undefined"!=typeof global?global:"undefined"!=typeof self?self:{},n=(new Error).stack;n&&(e._sentryDebugIds=e._sentryDebugIds||{},e._sentryDebugIds[n]="5d538b78-2f2f-58ad-9309-85d4bf5b2e1e")}catch(e){}}();
// ^ Build-injected snippet: records this module's debug ID in the global
//   `_sentryDebugIds` map, keyed by the current stack trace. Left verbatim.
import { ModelCompilerError, ModelCompilerMessageContext, ModelCompilerMessages } from "./model-compiler-messages.js";
import { callbacks } from "./compiler-callbacks.js";

// Supports LF or CRLF line terminators.
const NEWLINE_SEPARATOR = /\u000d?\u000a/;

/**
 * Returns a data structure that can be loaded by the TrieModel.
 *
 * It implements a **weighted** trie, whose indices (paths down the trie) are
 * generated by a search key, and not concrete wordforms themselves.
 *
 * @param filenames an array of word list filenames that will be read to
 *                  generate the trie.
 * @param searchTermToKey function that converts a wordform into its search
 *                        key; it MUST be supplied explicitly.
 * @returns the trie (`{ totalWeight, root }`) serialized as a JSON string.
 * @throws ModelCompilerError if `searchTermToKey` is not a function.
 */
export function createTrieDataStructure(filenames, searchTermToKey) {
  if (typeof searchTermToKey !== "function") {
    throw new ModelCompilerError(ModelCompilerMessages.Error_SearchTermToKeyMustBeExplicitlySpecified());
  }
  // Make one big word list out of all of the filenames provided.
  const wordlist = {};
  filenames.forEach(filename => parseWordListFromFilename(wordlist, filename));
  const trie = Trie.buildTrie(wordlist, searchTermToKey);
  return JSON.stringify(trie);
}

/**
 * Parses a word list from a file, merging duplicate entries.
 *
 * The word list may be encoded in:
 *
 *  - UTF-8, with or without BOM [exported by most software]
 *  - UTF-16, little endian, with BOM [exported by Microsoft Excel]
 *
 * Side effect: sets `ModelCompilerMessageContext.filename` to `filename`.
 *
 * @param wordlist word list to merge entries into (may have existing entries)
 * @param filename filename of the word list
 */
export function parseWordListFromFilename(wordlist, filename) {
  ModelCompilerMessageContext.filename = filename;
  _parseWordList(wordlist, new WordListFromFilename(filename));
}

/**
 * Parses a word list from a string. The string should have multiple lines
 * with LF or CRLF line terminators.
 *
 * Side effect: clears `ModelCompilerMessageContext.filename` (sets it to
 * `undefined`), since there is no backing file.
 *
 * @param wordlist word list to merge entries into (may have existing entries)
 * @param contents the text of the word list
 */
export function parseWordListFromContents(wordlist, contents) {
  ModelCompilerMessageContext.filename = undefined;
  _parseWordList(wordlist, new WordListFromMemory(contents));
}

/**
 * Reads tab-separated values into a word list. This function converts all
 * entries into NFC and merges duplicate entries across wordlists. Duplication is
 * on the basis of character-for-character equality after normalisation to NFC.
 *
 * Format specification:
 *
 *  - the file is a UTF-8 encoded text file.
 *  - new lines are either LF or CRLF.
 *  - the file MAY start with the UTF-8 byte-order mark (BOM); that is, if the
 *    first three bytes of the file are EF BB BF, these will be interpreted as
 *    the BOM and will be ignored.
 *  - each line is either a comment or an entry.
 *  - comment lines MUST start with the '#' character on the very first column
 *    (the parser trims each line first, so leading whitespace before '#' is
 *    tolerated in practice).
 *  - entries are one to three columns, separated by the (horizontal) tab
 *    character.
 *  - column 1 (REQUIRED): the wordform: can have any character except tab, CR,
 *    LF. Surrounding whitespace characters are trimmed.
 *  - column 2 (optional): the count: a non-negative integer specifying how many
 *    times this entry has appeared in the corpus. Blank means 'indeterminate';
 *    commas are permissible in the digits.
 *  - column 3 (optional): comment: an informative comment, ignored by the tool.
 *
 * Side effects: updates `ModelCompilerMessageContext.line` for every line read
 * and reports hints through `callbacks.reportMessage`.
 *
 * @param wordlist word list to merge entries into (may have existing entries)
 * @param source   object whose `lines()` method yields `[lineno, line]` pairs
 */
function _parseWordList(wordlist, source) {
  const TAB = "\t";
  const wordsSeenInThisFile = new Set();

  for (const [lineno, rawLine] of source.lines()) {
    ModelCompilerMessageContext.line = lineno;

    // Remove the byte-order mark (BOM) from the beginning of the string.
    // Because `contents` can be the concatenation of several files, we have to remove
    // the BOM from every possible start of file -- i.e., beginning of every line.
    const line = rawLine.replace(/^\uFEFF/, '').trim();
    if (line.startsWith('#') || line === "") {
      continue; // skip comments and empty lines
    }

    // The third column is the comment. Always ignored!
    const [rawWordform, rawCountText] = line.split(TAB);

    // Clean the word form.
    const normalized = rawWordform.normalize('NFC');
    if (rawWordform !== normalized) {
      // Mixed normalization forms are yucky! Hint about it, because it may
      // result in unexpected counts where multiple normalization forms for one
      // word are merged into a single entry.
      callbacks.reportMessage(ModelCompilerMessages.Hint_MixedNormalizationForms({ wordform: normalized }));
    }
    const wordform = normalized.trim();

    const countText = (rawCountText ?? '').trim().replace(/,/g, '');
    let count = parseInt(countText, 10);
    // When parsing a decimal integer fails (e.g., blank or something else):
    if (!Number.isFinite(count) || count < 0) {
      // TODO: is this the right thing to do?
      // Treat it like a hapax legomenon -- it exists, but only once.
      count = 1;
    }

    if (wordsSeenInThisFile.has(wordform)) {
      // The same word seen across multiple files is fine, but a word seen
      // multiple times in one file may be a problem
      callbacks.reportMessage(ModelCompilerMessages.Hint_DuplicateWordInSameFile({ wordform: wordform }));
    }
    wordsSeenInThisFile.add(wordform);

    addCountToWordList(wordlist, wordform, count);
  }
}

/**
 * Adds `count` to the running total stored for `wordform` in `wordlist`.
 *
 * Only OWN properties of `wordlist` are treated as existing counts, so members
 * inherited from `Object.prototype` are never mistaken for a previous total.
 * The total is written with `Object.defineProperty` because a plain
 * `wordlist["__proto__"] = n` assignment is silently discarded for numbers,
 * which would drop the wordform "__proto__" from the list.
 *
 * @param wordlist word list object, mutated in place
 * @param wordform the (normalized, trimmed) wordform used as the property name
 * @param count    the count to add
 */
function addCountToWordList(wordlist, wordform, count) {
  const hasEntry = Object.prototype.hasOwnProperty.call(wordlist, wordform);
  const previous = hasEntry ? wordlist[wordform] : 0;
  // Anything that is not a real number (including NaN) counts as "no previous total".
  const base = (typeof previous === 'number' && !Number.isNaN(previous)) ? previous : 0;
  Object.defineProperty(wordlist, wordform, {
    value: base + count,
    writable: true,
    enumerable: true,
    configurable: true,
  });
}

/**
 * A word list source backed by a string already in memory.
 */
class WordListFromMemory {
  name = '<memory>';
  _contents;

  /**
   * @param contents the full text of the word list
   */
  constructor(contents) {
    this._contents = contents;
  }

  /**
   * Yields `[lineno, line]` pairs (line numbers start at 1) for each line of
   * the contents, split on LF or CRLF.
   */
  *lines() {
    yield* enumerateLines(this._contents.split(NEWLINE_SEPARATOR));
  }
}

/**
 * A word list source backed by a file, loaded through `callbacks.loadFile`.
 */
class WordListFromFilename {
  name;

  /**
   * @param filename filename of the word list; also used as `name`
   */
  constructor(filename) {
    this.name = filename;
  }

  /**
   * Loads and decodes the file, then yields `[lineno, line]` pairs (line
   * numbers start at 1). The file is only read once iteration begins.
   *
   * @throws ModelCompilerError if `callbacks.loadFile` returns a falsy value,
   *         or (via `detectEncoding`) if the file is UTF-16 big endian.
   */
  *lines() {
    // NOTE(review): presumably a Uint8Array/Buffer of raw bytes -- confirm
    // against the callbacks implementation.
    const data = callbacks.loadFile(this.name);
    if (!data) {
      throw new ModelCompilerError(ModelCompilerMessages.Error_WordlistFileNotFound({ filename: this.name }));
    }
    const contents = new TextDecoder(detectEncoding(data)).decode(data);
    yield* enumerateLines(contents.split(NEWLINE_SEPARATOR));
  }
}

/**
 * Yields pairs of [lineno, line], given an Array of lines.
 * Line numbers start at 1.
 */
function* enumerateLines(lines) {
  let i = 1;
  for (const line of lines) {
    yield [i, line];
    i++;
  }
}

// Compiled form of a TypeScript namespace: `Trie` is populated by the
// immediately-invoked function below and exposes `Trie.buildTrie`.
let Trie;
(function (Trie_1) {
  /**
   * A sentinel value for when an internal node has contents and requires an
   * "internal" leaf. That is, this internal node has content. Instead of placing
   * entries as children in an internal node, a "fake" leaf is created, and its
   * key is this special internal value.
   *
   * The value is a valid Unicode BMP code point, but it is a "non-character".
   * Unicode will never assign semantics to these characters, as they are
   * intended to be used internally as sentinel values.
   */
  const INTERNAL_VALUE = '\uFDD0';

  /**
   * Builds a trie from a word list.
   *
   * @param wordlist    The wordlist with non-negative weights.
   * @param keyFunction Function that converts word forms into indexed search keys
   * @returns A JSON-serializable object that can be given to the TrieModel constructor.
   */
  function buildTrie(wordlist, keyFunction) {
    const root = new Trie(keyFunction).buildFromWordList(wordlist).root;
    return {
      totalWeight: sumWeights(root),
      root: root
    };
  }
  Trie_1.buildTrie = buildTrie;

  /**
   * Wrapper class for the trie and its nodes and wordform to search
   */
  class Trie {
    root = createRootNode();
    toKey;

    /**
     * @param wordform2key function that converts a wordform into its search key
     */
    constructor(wordform2key) {
      this.toKey = wordform2key;
    }

    /**
     * Populates the trie with the contents of an entire wordlist, then sorts it.
     * @param words an object mapping each wordform to its count.
     * @returns this trie, to allow chaining.
     */
    buildFromWordList(words) {
      for (const [wordform, weight] of Object.entries(words)) {
        const key = this.toKey(wordform);
        addUnsorted(this.root, { key, weight, content: wordform }, 0);
      }
      sortTrie(this.root);
      return this;
    }
  }

  // "Constructors"

  /**
   * Creates an empty leaf node. Used both for the root and for every newly
   * created child node.
   */
  function createRootNode() {
    return {
      type: 'leaf',
      weight: 0,
      entries: []
    };
  }

  // Implement Trie creation.

  /**
   * Adds an entry to the trie.
   *
   * Note that the trie will likely be unsorted after the add occurs. Before
   * performing a lookup on the trie, call sortTrie() on the root node!
   *
   * @param node   Which node should the entry be added to?
   * @param entry  the wordform/weight/key to add to the trie
   * @param index  the index in the key and also the trie depth. Should be set to
   *               zero when adding onto the root node of the trie.
   */
  function addUnsorted(node, entry, index = 0) {
    // Each node stores the MAXIMUM weight out of all of its descendants, to
    // enable a greedy search through the trie.
    node.weight = Math.max(node.weight, entry.weight);

    // When should a leaf become an interior node?
    // When it already has a value, but the key of the current value is longer
    // than the prefix.
    if (node.type === 'leaf' && index < entry.key.length && node.entries.length >= 1) {
      convertLeafToInternalNode(node, index);
    }

    if (node.type === 'leaf') {
      // The key matches this leaf node, so add yet another entry.
      addItemToLeaf(node, entry);
    } else {
      // Push the node down to a lower node.
      addItemToInternalNode(node, entry, index);
    }
    node.unsorted = true;
  }

  /**
   * Adds an item to the internal node at a given depth.
   * @param node  the internal node to add the item under
   * @param item  the entry to add
   * @param index the depth of `node`, i.e. the position in `item.key` that
   *              selects the child
   */
  function addItemToInternalNode(node, item, index) {
    let char = item.key[index];
    // If an internal node is the proper site for item, it belongs under the
    // corresponding (sentinel, internal-use) child node signifying this.
    // (Indexing past the end of the key yields `undefined`.)
    if (char === undefined) {
      char = INTERNAL_VALUE;
    }
    if (!node.children[char]) {
      node.children[char] = createRootNode();
      node.values.push(char);
    }
    addUnsorted(node.children[char], item, index + 1);
  }

  /**
   * Appends an item to a leaf node's entries.
   */
  function addItemToLeaf(leaf, item) {
    leaf.entries.push(item);
  }

  /**
   * Mutates the given Leaf to turn it into an InternalNode.
   *
   * NOTE: the node passed in will be DESTRUCTIVELY CHANGED into a different
   * type when passed into this function!
   *
   * @param leaf  the leaf node to convert in place
   * @param depth depth of the trie at this level.
   */
  function convertLeafToInternalNode(leaf, depth) {
    const entries = leaf.entries;
    // Alias the current node, as the desired type.
    const internal = leaf;
    internal.type = 'internal';
    delete leaf.entries;
    internal.values = [];
    internal.children = {};

    // Convert the old values array into the format for interior nodes.
    for (const item of entries) {
      let char;
      if (depth < item.key.length) {
        char = item.key[depth];
      } else {
        // The entry's key ends at this node: file it under the sentinel child.
        char = INTERNAL_VALUE;
      }
      if (!internal.children[char]) {
        internal.children[char] = createRootNode();
        internal.values.push(char);
      }
      addUnsorted(internal.children[char], item, depth + 1);
    }
    internal.unsorted = true;
  }

  /**
   * Recursively sort the trie, in descending order of weight.
   * Removes the `unsorted` marker from every node it sorts.
   * @param node any node in the trie
   */
  function sortTrie(node) {
    if (node.type === 'leaf') {
      if (!node.unsorted) {
        return;
      }
      node.entries.sort((a, b) => b.weight - a.weight);
    } else {
      // We MUST recurse and sort children before returning.
      for (const char of node.values) {
        sortTrie(node.children[char]);
      }
      if (!node.unsorted) {
        return;
      }
      node.values.sort((a, b) => node.children[b].weight - node.children[a].weight);
    }
    delete node.unsorted;
  }

  /**
   * O(n) recursive traversal to sum the total weight of all leaves in the
   * trie, starting at the provided node.
   *
   * @param node The node to start summing weights.
   * @returns the sum of the weights of every entry beneath `node`.
   * @throws Error if the sum turns out to be NaN.
   */
  function sumWeights(node) {
    let val;
    if (node.type === 'leaf') {
      val = node.entries
        .map(entry => entry.weight)
        .reduce((acc, count) => acc + count, 0);
    } else {
      val = Object.keys(node.children)
        .map((key) => sumWeights(node.children[key]))
        .reduce((acc, count) => acc + count, 0);
    }
    if (Number.isNaN(val)) {
      throw new Error("Unexpected NaN has appeared!");
    }
    return val;
  }
})(Trie || (Trie = {}));

/**
 * Detects the encoding of a text file from its leading bytes.
 *
 * Supported encodings are:
 *
 *  - UTF-8, with or without BOM
 *  - UTF-16, little endian, with BOM
 *
 * UTF-16 in big endian is explicitly NOT supported! The reason is two-fold:
 * 1) Node does not support it without resorting to an external library (or
 * swapping every byte in the file!); and 2) I'm not sure anything actually
 * outputs in this format anyway!
 *
 * @param buffer the raw bytes of the file (indexable, with a `length`); only
 *               the first two bytes are inspected.
 * @returns 'utf-16le' or 'utf-8', suitable as a TextDecoder label.
 * @throws ModelCompilerError if the buffer starts with the UTF-16 BE BOM.
 */
function detectEncoding(buffer) {
  if (buffer.length < 2) {
    return 'utf-8';
  }
  // Note: BOM is U+FEFF
  // In little endian, this is 0xFF 0xFE
  if (buffer[0] === 0xFF && buffer[1] === 0xFE) {
    return 'utf-16le';
  } else if (buffer[0] === 0xFE && buffer[1] === 0xFF) {
    // Big Endian, is NOT supported because Node does not support it (???)
    // See: https://stackoverflow.com/a/14551669/6626414
    throw new ModelCompilerError(ModelCompilerMessages.Error_UTF16BEUnsupported());
  } else {
    // Assume it's in UTF-8, with or without a BOM.
    return 'utf-8';
  }
}
//# sourceMappingURL=build-trie.js.map
//# debugId=5d538b78-2f2f-58ad-9309-85d4bf5b2e1e