UNPKG

@keymanapp/kmc-model

Version:

Keyman Developer lexical model compiler

147 lines 5.32 kB
/**
 * Interfaces and constants used by the lexical model compiler. These target
 * the LMLayer's internal worker code, so those definitions are provided here
 * as well.
 */
import { LexicalModelTypes } from '@keymanapp/common-types';
import CasingFunction = LexicalModelTypes.CasingFunction;
import LexicalModelPunctuation = LexicalModelTypes.LexicalModelPunctuation;
import WordBreakingFunction = LexicalModelTypes.WordBreakingFunction;

/**
 * Common base of the lexical model interfaces in this file: names the model
 * format in use.
 */
export interface LexicalModelDeclaration {
  readonly format:
    | 'trie-1.0'
    | 'fst-foma-1.0'
    | 'custom-1.0';
}

/**
 * @public
 * Word breaker specification for Keyman 14.0+.
 *
 * Covers everything the older word breaking specification could express,
 * and can additionally be extended with options.
 *
 * @since 14.0
 */
export interface WordBreakerSpec {
  /** The underlying word breaker, given in the simplified form. */
  readonly use: SimpleWordBreakerSpec;

  /**
   * When present, words that the word breaker split apart are joined back
   * together at the given strings, e.g.
   *
   *     joinWordsAt: ['-'] // to keep hyphenated items together
   *
   * @since 14.0
   */
  readonly joinWordsAt?: string[];

  /**
   * Overrides the word splitting behaviour for certain scripts.
   * For example, specifying that spaces break words in certain South-East
   * Asian scripts that otherwise do not use spaces.
   *
   * @since 14.0
   */
  readonly overrideScriptDefaults?: OverrideScriptDefaults;
}

/**
 * @public
 * Simplified word breaker specification.
 *
 * @since 11.0
 */
export type SimpleWordBreakerSpec =
  | 'default'
  | 'ascii'
  | WordBreakingFunction;

/**
 * @public
 * Simplifies input text so that entries can be found within a lexical
 * model's lexicon.
 * @since 11.0
 */
export type SimpleWordformToKeySpec = (term: string) => string;

/**
 * @public
 * Simplifies input text so that entries can be found within a lexical
 * model's lexicon, using the model's `applyCasing` function to assist in the
 * keying process.
 * @since 14.0
 */
export type CasedWordformToKeySpec = (
  term: string,
  applyCasing?: CasingFunction
) => string;

/**
 * @public
 * Simplifies input text so that entries can be found within a lexical
 * model's lexicon. Either the simple or the casing-aware form is accepted.
 */
export type WordformToKeySpec =
  | SimpleWordformToKeySpec
  | CasedWordformToKeySpec;

/**
 * Overrides the default word breaking behaviour for some scripts.
 *
 * Only one option currently exists:
 *
 * 'break-words-at-spaces'
 * : some South-East Asian scripts conventionally do not use a space or any
 *   other explicit word boundary character to write word breaks. These
 *   scripts are:
 *
 *   - Burmese
 *   - Khmer
 *   - Thai
 *   - Lao
 *
 *   (this list may be incomplete and may be extended in the future)
 *
 * For these scripts, the default word breaker breaks at **every**
 * letter/syllable/ideograph. However, in languages that use these scripts BUT
 * use spaces (or some other delimiter) as word breaks, enable
 * 'break-words-at-spaces'; doing so prevents the word breaker from making
 * too many breaks in these scripts.
 *
 * @since 14.0
 */
export type OverrideScriptDefaults = 'break-words-at-spaces';

/**
 * @public
 * Base interface for a lexical model source definition.
 */
export interface LexicalModelSource extends LexicalModelDeclaration {
  readonly sources: string[];

  /**
   * The name of the type to instantiate (without parameters) as the base
   * object for a custom predictive model.
   */
  readonly rootClass?: string;

  /**
   * When set to `true`, suggestions will attempt to match the case of the
   * input text even if the lexicon entries use a different casing scheme due
   * to search term keying effects.
   * @since 14.0
   */
  readonly languageUsesCasing?: boolean;

  /**
   * Specifies the casing rules for a language. Should implement three casing
   * forms:
   * - 'lower' -- a fully-lowercased version of the text appropriate for the
   *   language's use of the writing system.
   * - 'upper' -- a fully-uppercased version of the text.
   * - 'initial' -- a version preserving the input casing aside from the
   *   initial character, which is uppercased (as with proper nouns and
   *   sentence-initial words in English sentences).
   *
   * Only utilized when `languageUsesCasing` is defined and set to `true`.
   * @since 14.0
   */
  readonly applyCasing?: CasingFunction;

  /**
   * Which word breaker to use. Choose from:
   *
   * - 'default' -- breaks according to Unicode UAX #29 §4.1 Default Word
   *   Boundary Specification, which works well for *most* languages.
   * - 'ascii' -- a very simple word breaker, for demonstration purposes only.
   * - word breaking function -- provide your own function that breaks words.
   * - class-based word-breaker -- may be supported in the future.
   */
  readonly wordBreaker?: WordBreakerSpec | SimpleWordBreakerSpec;

  /**
   * How to simplify words in order to convert them into simplified search
   * keys. This often involves removing accents, lowercasing, etc.
   */
  readonly searchTermToKey?: WordformToKeySpec;

  /**
   * Punctuation and spacing suggested by the model.
   *
   * @see LexicalModelPunctuation
   */
  readonly punctuation?: LexicalModelPunctuation;
}

/**
 * A lexical model declaration that additionally carries an `id` string.
 */
export interface LexicalModelCompiled extends LexicalModelDeclaration {
  readonly id: string;
}
//# sourceMappingURL=lexical-model.d.ts.map