cld3-asm
Version:
WebAssembly-based JavaScript bindings for Google's Compact Language Detector 3
95 lines (94 loc) • 3.45 kB
TypeScript
/**
* Language value reported in `LanguageResult.language` when detection fails.
*/
export declare const UnknownLanguage: string;
/**
* Outcome of a single language prediction, as returned by
* `NNetLanguageIdentifier.FindLanguage` and by `ResultVector.get`.
*/
export interface LanguageResult {
/**
* Detected language. Equals {@link UnknownLanguage} if detection fails.
*/
language: string;
/**
* Probability associated with the detected language.
*/
probability: number;
/**
* Whether the prediction is reliable. Field name is snake_case, unlike the
* other members of this file.
* NOTE(review): presumably mirrors the field name of the underlying native
* result structure - confirm before renaming.
*/
is_reliable: boolean;
/**
* Proportion of bytes associated with the language. When the result comes
* from `FindLanguage`, this is set to 1.
*/
proportion: number;
}
/**
* @internal
*
* Collection of {@link LanguageResult} entries, as returned by
* `NNetLanguageIdentifier.FindTopNMostFreqLangs`.
*/
export interface ResultVector {
/**
* NOTE(review): presumably the number of entries held by the vector - confirm.
*/
size(): number;
/**
* Returns the entry stored at `index`.
* NOTE(review): presumably zero-based, with valid indices below `size()` -
* confirm; out-of-range behavior is not visible from this declaration.
*/
get(index: number): LanguageResult;
}
/**
* @internal
*
* Language identifier instance. Offers single-language detection
* (`FindLanguage`), top-N detection over script spans
* (`FindTopNMostFreqLangs`), and explicit destruction (`delete`).
*
* NOTE(review): `min_num_bytes_` / `max_num_bytes_` mentioned below presumably
* correspond to the `minBytes` / `maxBytes` arguments of the constructor
* declared on `CldAsmModule.NNetLanguageIdentifier` - confirm.
*/
export interface NNetLanguageIdentifier {
/**
* Finds the most likely language for the given text, along with additional
* information (e.g., probability). The prediction is based on the first N
* bytes, where N is the minimum between the number of interchange valid UTF8
* bytes and max_num_bytes_. If N is less than min_num_bytes_ long, then this
* function returns kUnknown.
*
* @param text Text to identify.
* @returns The prediction for `text`.
*/
FindLanguage(text: string): LanguageResult;
/**
* Splits the input text (up to the first byte, if any, that is not
* interchange valid UTF8) into spans based on the script, predicts a language
* for each span, and returns a vector storing the top num_langs most frequent
* languages along with additional information (e.g., proportions). The number
* of bytes considered for each span is the minimum between the size of the
* span and max_num_bytes_. If more languages are requested than what is
* available in the input, then for those cases kUnknown is returned. Also, if
* the size of the span is less than min_num_bytes_ long, then the span is
* skipped. If the input text is too long, only the first
* kMaxNumInputBytesToConsider bytes are processed.
*
* @param text Text to identify.
* @param numLangs Number of most frequent languages to return (referred to
* as num_langs above).
* @returns Vector holding the per-language results.
*/
FindTopNMostFreqLangs(text: string, numLangs: number): ResultVector;
/**
* Destroys this identifier instance.
*/
delete(): void;
}
/**
* @internal
*
* Interface for module generated by emscripten to load wasm binary.
* https://kripken.github.io/emscripten-site/docs/api_reference/preamble.js.html
*/
export interface CldAsmModule {
/**
* Constructor for {@link NNetLanguageIdentifier} instances, together with the
* constants associated with the identifier.
*/
NNetLanguageIdentifier: {
/**
* Min number of bytes needed to make a prediction if the default constructor
* is called.
*/
kMinNumBytesToConsider: number;
/**
* Max number of bytes to consider to make a prediction if the default
* constructor is called.
*/
kMaxNumBytesToConsider: number;
/**
* Max number of input bytes to process.
*/
kMaxNumInputBytesToConsider: number;
/**
* Predictions with probability greater than or equal to this threshold are
* marked as reliable. This threshold was optimized on a set of text segments
* extracted from wikipedia, and results in an overall precision, recall,
* and f1 equal to 0.9760, 0.9624, and 0.9692, respectively.
*/
kReliabilityThreshold: number;
/**
* Reliability threshold for the languages hr and bs.
*/
kReliabilityHrBsThreshold: number;
/**
* Creates a new identifier instance.
*
* @param minBytes NOTE(review): presumably the minimum number of bytes
* needed to make a prediction (cf. `kMinNumBytesToConsider`) - confirm.
* @param maxBytes NOTE(review): presumably the maximum number of bytes
* considered for a prediction (cf. `kMaxNumBytesToConsider`) - confirm.
*/
new (minBytes: number, maxBytes: number): NNetLanguageIdentifier;
};
/**
* NOTE(review): presumably initializes the emscripten runtime before the
* module is used; the meaning of the resolved boolean is not visible from
* this declaration - confirm against the implementation.
*/
initializeRuntime(): Promise<boolean>;
}