UNPKG

@keymanapp/kmc-model

Version:

Keyman Developer lexical model compiler

102 lines 5.06 kB
/**
 * Converts a wordform into an indexable search key WITHOUT touching letter
 * case. The key is built by:
 *
 *   1. normalizing the text to NFKD via String.prototype.normalize(), which
 *      separates base characters from their combining marks;
 *   2. deleting the combining diacritical marks in U+0300..U+036F;
 *   3. replacing directional single and double quotation marks with their
 *      plain ASCII counterparts (' and ").
 *
 * This is a speculative, best-effort default: it might work for your
 * language, but that is not guaranteed. Orthographies regularly use code
 * points that do NOT decompose usefully under NFKD (the original authors
 * cite SENĆOŦEN and Plains Cree in syllabics as examples), and only marks
 * inside U+0300..U+036F are stripped.
 *
 * Use this in early iterations of a model. A production lexical model will
 * probably want its own key function, tailored to the language.
 *
 * @param wordform The word to convert. Must be a string.
 * @returns The search key for `wordform`.
 */
export function defaultSearchTermToKey(wordform) {
  return wordform
    .normalize('NFKD')
    // Remove any combining diacritics (the input is in NFKD at this point).
    .replace(/[\u0300-\u036F]/g, '')
    // Replace directional quotation marks with plain apostrophes.
    .replace(/[‘’]/g, "'")
    // Also double-quote marks.
    .replace(/[“”]/g, '"');
}

/**
 * Converts a wordform into an indexable search key, additionally folding
 * letter case. The steps are:
 *
 *   1. normalize to NFKD and delete combining marks in U+0300..U+036F;
 *   2. split the result into individual code points and pass EACH ONE
 *      separately to `applyCasing('lower', c)`, so that context-sensitive
 *      case transformations are disregarded;
 *   3. re-join the pieces and replace directional single and double
 *      quotation marks with plain ASCII ' and ".
 *
 * The same caveats as for `defaultSearchTermToKey` apply: this is a
 * speculative default that may not suit every orthography.
 *
 * @param wordform    The word to convert. Must be a string.
 * @param applyCasing Casing callback invoked as `applyCasing('lower', c)`
 *                    for every code point `c`; its return values are joined
 *                    to form the key.
 * @returns The case-folded search key for `wordform`.
 */
export function defaultCasedSearchTermToKey(wordform, applyCasing) {
  // While this is a bit WET, as the basic `defaultSearchTermToKey` exists and
  // performs some of the same steps, repetition is the easiest way to allow
  // the function to be safely compiled by use of `.toString()`. For the same
  // reason the body must stay self-contained: do not factor the shared steps
  // out into a helper defined elsewhere in this module.
  return Array.from(
    wordform
      .normalize('NFKD')
      // Remove any combining diacritics (the input is in NFKD at this point).
      .replace(/[\u0300-\u036F]/g, '')
  ) // end of `Array.from`: one array entry per code point
    .map(function (c) {
      return applyCasing('lower', c);
    })
    .join('')
    // Replace directional quotation marks with plain apostrophes.
    .replace(/[‘’]/g, "'")
    // Also double-quote marks.
    .replace(/[“”]/g, '"');
}

/**
 * Specifies default casing behavior for lexical models when
 * `languageUsesCasing` is set to true.
 *
 * @param casing One of 'lower' (lowercased), 'upper' (uppercased), or
 *               'initial'.
 *
 *               'initial' is designed to cover cases like sentence-initial
 *               & proper noun capitalization in English: only the first
 *               character is uppercased and the remainder of the text is
 *               returned unchanged. This may be overwritten as appropriate
 *               in model-specific implementations.
 * @param text   The text to be modified.
 * @returns The re-cased text. Any other `casing` value matches no branch,
 *          so the function returns `undefined`.
 */
export function defaultApplyCasing(casing, text) {
  // NOTE(review): `var` and ES5-style syntax are kept deliberately; a sibling
  // function in this module is documented as being compiled via
  // `.toString()`, and this one is presumably emitted the same way. Confirm
  // the supported targets before modernizing.
  switch (casing) {
    case 'lower':
      return text.toLowerCase();
    case 'upper':
      return text.toUpperCase();
    case 'initial':
      var headCode = text.charCodeAt(0);
      // The length of the first code point, as measured in UTF-16 code units.
      var headUnitLength = 1;
      // Is the first code unit a high surrogate, indicating possible use of a
      // UTF-16 surrogate pair? Also, is the string long enough for there to
      // BE a pair?
      if (text.length > 1 && headCode >= 0xD800 && headCode <= 0xDBFF) {
        // It's possible, so now we check for a low surrogate.
        var lowSurrogateCode = text.charCodeAt(1);
        if (lowSurrogateCode >= 0xDC00 && lowSurrogateCode <= 0xDFFF) {
          // We have a surrogate pair; this pair is the 'first' character.
          headUnitLength++;
        }
      }
      // Capitalizes the first character of the string, leaving the rest intact.
      return text.substring(0, headUnitLength).toUpperCase() // head - uppercased
        .concat(text.substring(headUnitLength)); // tail - unchanged
  }
}
//# sourceMappingURL=model-defaults.js.map