UNPKG

@keymanapp/models-types

Version:

Type definitions used in the modeling (lexical model/predictive text) component of Keyman.

571 lines (515 loc) 21.1 kB
/**
 * TypeScript interfaces and types required both within the LMLayer and for
 * tools that create lexical models.
 */

/****************************** Lexical Models ******************************/

/**
 * A JavaScript string with the restriction that it must only
 * contain Unicode scalar values.
 *
 * This means that any high surrogate must be paired with a low
 * surrogate. Lone surrogate code units are forbidden.
 *
 * See also: https://developer.mozilla.org/en-US/docs/Web/API/USVString
 */
declare type USVString = string;

/**
 * The casing patterns that may be requested of a lexical model.
 *
 * - `'lower'`: all case-sensitive characters lowercased.
 * - `'initial'`: only the word-initial character uppercased.
 * - `'upper'`: all case-sensitive characters uppercased.
 *
 * @see LexicalModel.applyCasing
 */
declare type CasingForm = 'lower' | 'initial' | 'upper';

/**
 * Used to facilitate edit-distance calculations by allowing the LMLayer to
 * efficiently search the model's lexicon in a Trie-like manner.
 */
declare interface LexiconTraversal {
  /**
   * Provides an iterable pattern used to search for words with a prefix matching
   * the current traversal state's prefix when a new character is appended. Iterating
   * across `children` provides 'breadth' to a lexical search.
   *
   * For an example with English, if the current traversal state corresponds to 'th',
   * children() may return an iterator with states corresponding to 'e'
   * (for 'the', 'then', 'there'), 'a' (for 'than', 'that'), etc.
   *
   * Each yielded value has two members:
   *
   * - `char`: A full character (a Unicode code point, which may be comprised of two
   *   UTF-16 code units) corresponding to the child node, appended to the current
   *   node's prefix to produce the child node's prefix.
   *
   *   Example: if the current traversal's represented prefix is 'th',
   *   char = 'e' would indicate a child node with prefix = 'the'.
   *
   * - `traversal`: a function returning a LexiconTraversal starting from (or
   *   "rooted at") the child node. Use of the returned object provides 'depth'
   *   to a lexical search.
   *
   *   Example:
   *
   *   - Suppose our current LexiconTraversal represents a prefix of 'th'.
   *   - If `char` = 'e', the child represents a prefix of 'the'.
   *   - Then `traversal` allows traversing the part of the lexicon prefixed by 'the'.
   */
  children(): Generator<{char: USVString, traversal: () => LexiconTraversal}>;

  /**
   * Any entries directly keyed by the currently-represented lookup prefix. Entries and
   * children may exist simultaneously, but `entries` must always exist when no children are
   * available in the returned `children()` iterable.
   *
   * Examples for English:
   * - search prefix of 'th': [] - an empty array. 'th' is not a valid English word.
   * - search prefix of 'the': ['the']. 'then' and 'there' may also exist within the lexicon,
   *   but they most directly belong to deeper traversal states.
   *
   * May contain multiple entries if a lexical model performs 'keying' operations, such as
   * may result from stripping accent markers from Spanish or French. In this case, all entries
   * transformed to the same 'key' should be listed by their key's traversal node.
   *
   * Example using French "accent homographs", where keying operations strip accents:
   *
   * - prefix of 'acre': ['acre', 'âcre']
   * - prefix of 'crepe': ['crêpe', 'crêpé']
   * - other examples: https://www.thoughtco.com/french-accent-homographs-1371072
   */
  entries: USVString[];
}

/**
 * The model implementation, within the Worker.
 */
declare interface LexicalModel {
  /**
   * Processes `config` messages, configuring the newly-loaded model based on the host
   * platform's capability restrictions.
   *
   * This allows the model to configure its suggestions according to what the platform
   * allows the host to actually perform - for example, if post-caret deletions are not
   * supported, no suggestions requiring this feature should be produced by the model.
   *
   * @param capabilities The capabilities reported by the host platform.
   * @returns A `Configuration` object detailing the capabilities the model plans to
   *          actually utilize, which must be as restrictive or more restrictive than
   *          those indicated within the provided `Capabilities` object.
   */
  configure(capabilities: Capabilities): Configuration;

  /**
   * Indicates that the language represented by the lexical model has syntactic casing
   * behaviors. Setting this to true will allow the predictive text engine to
   * perform casing-based corrections and predictions.
   *
   * If set to `false`, the default behavior for the `toKey` method will not perform
   * casing modifications and will thus yield case-sensitive results. This will not occur
   * if `undefined` for backward-compatibility reasons.
   */
  readonly languageUsesCasing?: boolean;

  /**
   * Represents casing-related syntactical behaviors of the language represented by
   * this lexical model, modifying input text to follow the specified casing pattern.
   *
   * Implementations may assume that the text represents a single word / 'token' from the
   * context.
   *
   * Patterns:
   * - lower: all case-sensitive characters should be lowercased. Example: "text123"
   * - upper: all case-sensitive characters should be uppercased. Example: "TEXT123"
   * - initial: only the word-initial character should be uppercased.
   *
   * @param form The casing pattern to apply.
   * @param text The text (a single word / token) to re-case.
   * @returns `text`, modified to follow the casing pattern named by `form`.
   */
  applyCasing?(form: CasingForm, text: string): string;

  /**
   * Indicates a mapping function used by the model to simplify lookup operations
   * within the lexicon. This is expected to result in a many-to-one mapping, transforming
   * the input text into a common, simplified 'index'/'key' form shared by all
   * text forms that a person might reasonably interpret as "the same".
   *
   * Example usages:
   * - converting any upper-case characters into lowercase.
   *   - For English, 'CAT' and 'Cat' might be keyed as 'cat', since users expect all
   *     three to be treated as the same word.
   * - removing accent marks that may be difficult to type on standard keyboard layouts
   *   - For French, users may wish to type "jeune" instead of "jeûne" when lazy or
   *     if accent marks cannot be easily input.
   *
   * Providing a function targeted for your language can greatly improve a user's experience
   * using your dictionary.
   *
   * @param text The original input text.
   * @returns The 'keyed' form of that text.
   */
  toKey?(text: USVString): USVString;

  /**
   * Generates predictive suggestions corresponding to the state of context after the proposed
   * transform is applied to it.
   *
   * This method should NOT attempt to perform any form of correction; this is modeled within a
   * separate component of the LMLayer predictive engine. That is, "th" + "e" should not
   * have "this" for a suggestion ("e" has been 'corrected' to "i"), while "there" would be
   * a reasonable prediction.
   *
   * However, addition of diacritics to characters (which may transform the underlying char code
   * when Unicode-normalized) is permitted. For example, "pur" + "e" may reasonably predict
   * "purée", where "e" has been transformed to "é" as part of the suggestion. When possible,
   * it is recommended to accomplish this by defining a `toKey` (`searchTermToKey` in model
   * source) instead.
   *
   * When both prediction and correction are permitted, said component (the `ModelCompositor`) will
   * generally call this method once per 'likely' generated corrected state of the context,
   * utilizing the results to compute an overall likelihood across all possible suggestions.
   *
   * @param transform A Transform corresponding to a recent input keystroke
   * @param context A depiction of the context to which `transform` is applied.
   * @returns A probability distribution (`Distribution<Suggestion>`) on the resulting `Suggestion`
   *          space for use in determining the most optimal overall suggestions.
   */
  predict(transform: Transform, context: Context): Distribution<Suggestion>;

  /**
   * Punctuation and presentational settings that the underlying lexical model
   * expects to be applied at higher levels. e.g., the ModelCompositor.
   *
   * @see LexicalModelPunctuation
   */
  readonly punctuation?: LexicalModelPunctuation;

  /**
   * Returns the wordbreaker function defined for the model (if it exists). This
   * wordbreaker should operate on a plain JS `string`, fully tokenizing it into
   * separate words according to the syntactical rules of the modeled language.
   *
   * Needed to support many of the enhancements in 14.0, as enhanced wordbreaking /
   * tokenization is necessary for properly tracking possible "fat finger" inputs
   * and intermediate calculations (increasing prediction quality) and preventing
   * their misuse when starting new words.
   */
  wordbreaker?: WordBreakingFunction;

  /**
   * Performs a wordbreak operation given the current context state, returning whatever word
   * or word fragment exists that starts before the caret but after the most recent whitespace
   * preceding the caret. If no such text exists, the empty string is returned.
   *
   * This function is designed for use in generating display text for 'keep' `Suggestions`
   * and display text for reverting any previously-applied `Suggestions`.
   *
   * ------------------
   *
   * **NOTE: _Deprecated_** and replaced by `wordbreaker` in 14.0. You may still wish
   * to implement this function by reusing your `wordbreaker` definition if the model
   * may see use on Keyman 12.0 or 13.0, generally by returning `wordbreaker(context.left)`.
   *
   * As this function only tokenizes a single word from the context, it is insufficient for
   * supporting many of the predictive-text enhancements introduced in Keyman 14. Its
   * intermediate calculations are tracked on a per-word basis and the increased detail
   * provided by `wordbreaker` helps with stability, validating the engine's use of
   * the current context.
   *
   * @param context The current context state.
   * @returns The word or word fragment immediately preceding the caret, or the empty string.
   * @deprecated Replaced by `wordbreaker` in 14.0.
   */
  wordbreak?(context: Context): USVString;

  /**
   * Lexical models _may_ provide a LexiconTraversal object usable to enhance
   * prediction and correction results. The returned object represents the
   * unfiltered lexicon (with an empty prefix).
   */
  traverseFromRoot?(): LexiconTraversal;
}

/**
 * Describes how to change a buffer at the cursor position.
 * First, you delete the specified amount from the left
 * and right, then you insert the provided text.
 */
declare interface Transform {
  /**
   * Facilitates use of unique identifiers for tracking the Transform and
   * any related data from its original source, as the reference cannot be
   * preserved across WebWorker boundaries.
   *
   * This is *separate* from any LMLayer-internal identification values.
   */
  id?: number;

  /**
   * The Unicode scalar values (i.e., characters) to be inserted at the
   * cursor position.
   *
   * Corresponds to `s` in com.keyman.KeyboardInterface.output.
   */
  insert: USVString;

  /**
   * The number of code units to delete to the left of the cursor.
   *
   * Corresponds to `dn` in com.keyman.KeyboardInterface.output.
   */
  deleteLeft: number;

  /**
   * The number of code units to delete to the right of the cursor.
   * Not available on all platforms.
   */
  deleteRight?: number;
}

/**
 * A concrete suggestion
 */
declare interface Suggestion {
  /**
   * Indicates the externally-supplied id of the Transform that prompted
   * the Suggestion. Automatically handled by the LMLayer; models should
   * not handle this field.
   */
  transformId?: number;

  /**
   * A unique identifier for the Suggestion itself, not shared with any others -
   * even for Suggestions sourced from the same Transform.
   *
   * The lm-layer is responsible for setting this field, not models.
   */
  id?: number;

  /**
   * The suggested update to the buffer. Note that this transform should
   * be applied AFTER the instigating transform, if any.
   */
  readonly transform: Transform;

  /**
   * A string to display the suggestion to the typist.
   * This should help the typist understand what the transform
   * will do to their text.
   *
   * When suggesting a word, `displayAs` should be that entire word.
   */
  displayAs: string;

  /**
   * A single metalabel describing the relation of the suggestion
   * to the input text. Ex: 'keep', 'emoji', 'correction', etc.
   */
  tag?: SuggestionTag;
}

/**
 * A `Suggestion` whose tag is always `'revert'`.
 */
declare interface Reversion extends Suggestion {
  tag: 'revert';
}

/**
 * A `Suggestion` whose tag is always `'keep'`: it suggests the word as it was
 * typed in the first place.
 */
declare interface Keep extends Suggestion {
  tag: 'keep';

  /**
   * Notes whether or not the Suggestion may actually be suggested by the model.
   * Should be `false` if the model does not actually predict the current text.
   */
  matchesModel: boolean;
}

/**
 * A tag indicating the nature of the current suggestion.
 *
 * Tags include:
 * * 'keep' -- suggest the word as what was typed in the first place.
 *   This tends to be presented as the leftmost suggestion.
 *   @see LexicalModelPunctuation.quotesForKeepSuggestion
 * * 'revert' -- the tag carried by `Reversion` suggestions.
 * * 'correction' -- this suggests a correction to the current phrase
 * * 'emoji' -- replaces whatever is typed in with an appropriate emoji.
 *   This tends to be presented as the rightmost suggestion.
 *
 * If left undefined, the consumers will assume this is a prediction.
 */
declare type SuggestionTag = undefined | 'keep' | 'revert' | 'correction' | 'emoji';

/**
 * The text and environment surrounding the insertion point (text cursor).
 */
declare interface Context {
  /**
   * The text (Unicode scalar values, i.e., characters) to the left of the
   * insertion point in the buffer, limited to the configured left-context
   * size (see `Configuration.leftContextCodePoints`). If there is nothing
   * to the left of the insertion point, this is an empty string.
   */
  readonly left: USVString;

  /**
   * The text (Unicode scalar values, i.e., characters) to the right of the
   * insertion point in the buffer, limited to the configured right-context
   * size (see `Configuration.rightContextCodePoints`). If there is nothing
   * to the right of the insertion point, this is an empty string.
   *
   * This property may be missing entirely.
   */
  readonly right?: USVString;

  /**
   * Whether the insertion point is at the start of the buffer.
   */
  readonly startOfBuffer: boolean;

  /**
   * Whether the insertion point is at the end of the buffer.
   */
  readonly endOfBuffer: boolean;

  /**
   * The casing form to use based on the current layer of the touch layout.
   */
  readonly casingForm?: CasingForm;
}

/**
 * Represents members of a probability distribution over potential outputs
 * from ambiguous text sequences. Designed for use with fat-finger correction
 * and similar typing ambiguities.
 */
declare interface ProbabilityMass<T> {
  /**
   * An individual sample from a Distribution over the same type.
   */
  readonly sample: T;

  /**
   * The probability mass for this member of the distribution,
   * calculated devoid of any language-modeling influences.
   */
  p: number;
}

/**
 * A probability distribution over `T`, represented as a list of samples
 * paired with their probability mass.
 */
declare type Distribution<T> = ProbabilityMass<T>[];

/**
 * A type augmented with an optional probability.
 */
declare type Outcome<T> = T & {
  /**
   * [optional] probability of this outcome.
   */
  p?: number;
};

/**
 * A type augmented with a probability.
 */
declare type WithOutcome<T> = T & {
  /**
   * Probability of this outcome.
   */
  p: number;
};

/******************************** Messaging ********************************/

/**
 * Describes the capabilities of the keyboard's platform.
 * This includes upper bounds for how much text will be sent on each
 * prediction, as well as what operations the keyboard is allowed to do on the
 * underlying buffer.
 */
declare interface Capabilities {
  /**
   * The maximum number of code points that the keyboard will provide to
   * the left of the cursor, as an integer.
   */
  readonly maxLeftContextCodePoints: number;

  /**
   * The maximum number of code points that the keyboard will provide to the
   * right of the cursor, as an integer. The value 0 or the absence of this
   * rule implies that the right contexts are not supported.
   */
  readonly maxRightContextCodePoints?: number;

  /**
   * Whether the platform supports deleting to the right. The absence of this
   * rule implies false.
   *
   * Typed as `boolean` (rather than the literal `false`) so that a platform
   * which does support right-deletion is able to say so.
   */
  readonly supportsDeleteRight?: boolean;
}

/**
 * Configuration of the LMLayer, sent back to the keyboard.
 */
declare interface Configuration {
  /**
   * The maximum amount of text to send as the context to the
   * left of the cursor ("left" in the Unicode character stream).
   *
   * Affects the `context` property sent in `predict` messages.
   *
   * While the left context MUST NOT bisect surrogate pairs, it MAY
   * bisect graphical clusters.
   *
   * NOTE(review): the property name says code points, while this description
   * previously said "UTF-16 code units" (the unit named by the deprecated
   * `leftContextCodeUnits`) - confirm the intended unit.
   */
  leftContextCodePoints: number;

  /**
   * @deprecated Use `leftContextCodePoints` instead!
   */
  leftContextCodeUnits?: number;

  /**
   * The maximum amount of text to send as the context to the
   * right of the cursor ("right" in the Unicode character stream).
   *
   * Affects the `context` property sent in `predict` messages.
   *
   * While the right context MUST NOT bisect surrogate pairs, it MAY
   * bisect graphical clusters.
   *
   * NOTE(review): the property name says code points, while this description
   * previously said "UTF-16 code units" (the unit named by the deprecated
   * `rightContextCodeUnits`) - confirm the intended unit.
   */
  rightContextCodePoints: number;

  /**
   * @deprecated Use `rightContextCodePoints` instead!
   */
  rightContextCodeUnits?: number;

  /**
   * Whether or not the model appends characters to Suggestions for
   * wordbreaking purposes. (These characters need not be whitespace
   * or actual wordbreak characters.)
   *
   * If not specified, this will be auto-detected based on the model's
   * punctuation properties (if they exist).
   */
  wordbreaksAfterSuggestions?: boolean;
}

/****************************** Word breaking ******************************/

/**
 * A simple word breaking function takes a phrase, and splits it into "words",
 * for whatever definition of "word" is usable for the language model.
 *
 * For example:
 *
 *   getText(breakWordsEnglish("Hello, world!")) == ["Hello", "world"]
 *   getText(breakWordsCree("ᑕᐻ ᒥᔪ ᑮᓯᑲᐤ ᐊᓄᐦᐨ᙮")) == ["ᑕᐻ", "ᒥᔪ", "ᑮᓯᑲᐤ", "ᐊᓄᐦᐨ"]
 *   getText(breakWordsJapanese("英語を話せますか?")) == ["英語", "を", "話せます", "か"]
 *
 * Not all language models take in a configurable word breaking function.
 *
 * @returns an array of spans from the phrase, in order as they appear in the
 *          phrase, each span representing a word.
 */
declare interface WordBreakingFunction {
  /*
   * invariant: span[i].end <= span[i + 1].start
   * invariant: for all span[i] and span[i + 1], there does not exist a span[k]
   *            where span[i].end <= span[k].start AND span[k].end <= span[i + 1].start
   */
  (phrase: string): Span[];
}

/**
 * A function applying a casing pattern to a piece of text.
 *
 * @param caseToApply The casing pattern to apply.
 * @param text The text to re-case.
 * @param defaultApplyCasing An optional `CasingFunction`; presumably a default
 *        implementation the function may delegate to - confirm against callers.
 * @returns The re-cased text.
 */
declare interface CasingFunction {
  (caseToApply: CasingForm, text: string, defaultApplyCasing?: CasingFunction): string;
}

/**
 * A span of text in a phrase. This is usually meant to represent words from a
 * phrase.
 */
declare interface Span {
  /**
   * invariant: start < end (empty spans not allowed)
   */
  readonly start: number;

  /**
   * invariant: end > start (empty spans not allowed)
   */
  readonly end: number;

  /**
   * invariant: length === end - start
   */
  readonly length: number;

  /**
   * invariant: text.length === length
   * invariant: each character is BMP UTF-16 code unit, or is a high surrogate
   *            UTF-16 code unit followed by a low surrogate UTF-16 code unit.
   */
  readonly text: string;
}

/********************************** OTHER **********************************/

/**
 * Options for various punctuation to use in suggestions.
 */
declare interface LexicalModelPunctuation {
  /**
   * The quotes that appear in "keep" suggestions, e.g., keep what the user
   * typed verbatim.
   *
   * The keep suggestion is often the leftmost one, when suggested.
   *
   * [ “Hrllo” ] [ Hello ] [ Heck ]
   */
  readonly quotesForKeepSuggestion: {
    /**
     * What will appear on the opening side of the quote.
     * (left side for LTR scripts; right side for RTL scripts)
     *
     * Default: `“`
     */
    readonly open: string;

    /**
     * What will appear on the closing side of the quote.
     * (right side for LTR scripts; left side for RTL scripts)
     *
     * Default: `”`
     */
    readonly close: string;
  };

  /**
   * What punctuation or spacing to insert after every complete word
   * prediction. This can be set to the empty string when the script does not
   * use spaces to separate words.
   *
   * Default: ` `
   */
  readonly insertAfterWord: string;

  /**
   * Whether or not the model's language is typically displayed in RTL form.
   *
   * Default: false (or undefined)
   */
  readonly isRTL?: boolean;
}