UNPKG

cld3-asm

Version:

WebAssembly-based JavaScript bindings for Google's Compact Language Detector 3 (CLD3)

155 lines (132 loc) 6.06 kB
import { CldAsmModule, LanguageResult } from './cldAsmModule';
import { CldFactory } from './cldFactory';
import { LanguageCode } from './languageCode';
import { log } from './util/logger';
import { wrapCldInterface } from './wrapCldInterface';

// Size of a pointer in bytes, used to calculate field / element positions from a base pointer.
const PTR_SIZE = 4;

/**
 * @internal
 * Creates a factory function for mounting files into wasm filesystem
 * and creating language identifier instance.
 *
 * @param {CldAsmModule} asmModule wasm / asm module loaded into memory.
 *
 * @returns {CldFactory} Factory function manages lifecycle of cld3 language identifier.
 *
 * @throws {Error} If the unknown-language identifier reported by the binary does not
 * match `LanguageCode.UNKNOWN`.
 */
export const cldLoader = (asmModule: CldAsmModule): CldFactory => {
  const { cwrap, _free, allocateUTF8, _malloc, getValue, UTF8ToString, setValue } = asmModule;
  const cldInterface = wrapCldInterface(cwrap);

  /**
   * Naive auto-dispose interface to call cld interface with string params.
   *
   * Every argument except the last one is a string which is copied into module memory via
   * `allocateUTF8`; the last argument is a callback invoked with the resulting pointers, in order.
   * The allocated strings are released after the callback finishes, whether it returns or throws.
   *
   * @returns Whatever the callback returns.
   */
  const usingParamPtr = <T = void>(...args: Array<string | ((...args: Array<number>) => T)>): T => {
    const params = [...args];
    const fn = params.pop()!;
    const paramsPtr: Array<number> = [];

    try {
      // Allocate one by one so strings allocated so far are still released
      // if a later allocation throws.
      params.forEach(param => paramsPtr.push(allocateUTF8(param as string)));
      return (fn as (...ptrs: Array<number>) => T)(...paramsPtr);
    } finally {
      paramsPtr.forEach(paramPtr => _free(paramPtr));
    }
  };

  // grab constant values from cld3 library
  const unknownIdentifier = UTF8ToString(cldInterface.getUnknownIdentifier());
  const minBytesDefault = cldInterface.getMinNumBytesDefault();
  const maxBytesDefault = cldInterface.getMaxNumBytesDefault();
  const maxBytesInput = cldInterface.getMaxNumBytesInput();
  const languageResultStructSize = cldInterface.sizeLanguageResult();

  log(`cldLoader: cld3 wasm initialized with default values`, {
    unknownIdentifier,
    minBytesDefault,
    maxBytesDefault,
    maxBytesInput,
    languageResultStructSize
  });

  // both identifiers should match at all times, check when initializing the binary
  if (unknownIdentifier !== LanguageCode.UNKNOWN) {
    throw new Error(`cld3 binary unknownIdentifier constant does not match to LanguageCode enum`);
  }

  /**
   * Wrapper function to read SpanInfo** array from pointer.
   * After interop, pointer will be freed.
   *
   * @param arrayPtr Pointer to the array of SpanInfo* entries.
   * @param size Number of entries in the array.
   *
   * @returns Plain objects holding the values read from each SpanInfo struct.
   */
  const volatileReadSpanInfoArray = (arrayPtr: number, size: number) => {
    const ret = Array.from(new Array(size)).map((_, idx) => {
      const spanInfoPtr = getValue(arrayPtr + PTR_SIZE * idx, '*');

      // Each field occupies a 4 byte slot. The index fields are read as 'i32': an 'i8' read
      // only returns the lowest byte (sign-extended), which corrupts any index above 127.
      // NOTE(review): assumes the native fields are 32-bit ints, as in cld3's SpanInfo -
      // confirm against the wrapper's struct definition.
      const range = {
        start_index: getValue(spanInfoPtr + PTR_SIZE * 0, 'i32'),
        end_index: getValue(spanInfoPtr + PTR_SIZE * 1, 'i32'),
        probability: getValue(spanInfoPtr + PTR_SIZE * 2, 'float')
      };

      //free each individual SpanInfo* struct
      _free(spanInfoPtr);
      return range;
    });

    //free SpanInfo** array
    _free(arrayPtr);
    return ret;
  };

  /**
   * Wrapper function to read LanguageResult struct from pointer.
   * After interop, pointer will be freed, along with the language string
   * and byte range array it points to.
   *
   * @param structPtr Pointer to a LanguageResult struct filled in by the cld interface.
   *
   * @returns Values of the struct as a plain `LanguageResult` object.
   */
  const volatileReadResultStruct = (structPtr: number) => {
    // get value of first field of LanguageResult struct (char*)
    const languageStringPtr = getValue(structPtr + PTR_SIZE * 0, '*');

    // get ptr to array of byte range with its size.
    // The size is read as 'i32' for the same reason as the SpanInfo index fields:
    // 'i8' would truncate counts above 127 (and could even yield a negative length).
    // NOTE(review): assumes the native size field is a 32-bit int - confirm against the wrapper.
    const byteRangesSize = getValue(structPtr + PTR_SIZE * 4, 'i32');
    const byteRangesArrayPtr = getValue(structPtr + PTR_SIZE * 5, '*');

    // be careful to match order of properties to match pointer to struct field.
    const ret: LanguageResult = {
      language: UTF8ToString(languageStringPtr) as LanguageCode,
      probability: getValue(structPtr + PTR_SIZE * 1, 'float'),
      is_reliable: !!getValue(structPtr + PTR_SIZE * 2, 'i8'),
      proportion: getValue(structPtr + PTR_SIZE * 3, 'float'),
      byte_ranges: volatileReadSpanInfoArray(byteRangesArrayPtr, byteRangesSize)
    };

    //free char* for language string
    _free(languageStringPtr);
    //free struct
    _free(structPtr);

    return ret;
  };

  return {
    create: (minBytes: number = minBytesDefault, maxBytes: number = maxBytesDefault) => {
      const cldPtr = cldInterface.create(minBytes, maxBytes);

      return {
        findLanguage: (text: string) => {
          // `findLanguage` requires caller must allocate memory for return value.
          const resultPtr = _malloc(languageResultStructSize);

          try {
            usingParamPtr(text, textPtr => cldInterface.findLanguage(cldPtr, textPtr, resultPtr));
          } catch (e) {
            // The struct was not (reliably) filled in, so only release the raw allocation
            // instead of following the pointers it may contain.
            _free(resultPtr);
            throw e;
          }

          return volatileReadResultStruct(resultPtr);
        },
        findMostFrequentLanguages: (text: string, numLangs: number) => {
          // `findMostFrequentLanguages` requires caller must allocate memory for return value.
          const languageListPtr = _malloc(numLangs * PTR_SIZE);

          // For convenience, we'll store allocated pointer to each empty LanguageResult for return value
          const resultStructsPtr: Array<number> = [];

          //allocate memory in js. `findTopNMostFreqLangs` always returns vector with given num_langs, allows predictable memory allocation.
          for (let idx = 0; idx < numLangs; idx++) {
            const resultPtr = _malloc(languageResultStructSize);
            resultStructsPtr.push(resultPtr);
            // fill in array with allocated struct ptr
            setValue(languageListPtr + idx * PTR_SIZE, resultPtr, '*');
          }

          let languageCount: number;
          try {
            languageCount = usingParamPtr(text, textPtr =>
              cldInterface.findTopNMostFreqLangs(cldPtr, textPtr, numLangs, languageListPtr)
            );
          } catch (e) {
            // Native call failed: release the raw allocations without reading the structs,
            // their contents cannot be trusted at this point.
            resultStructsPtr.forEach(ptr => _free(ptr));
            _free(languageListPtr);
            throw e;
          }

          // if `numLangs` exceeds number of languages detected rest of array will be filled with default result with unknown language identifier
          const ret = resultStructsPtr
            .map(ptr => volatileReadResultStruct(ptr))
            .filter(x => x.language !== unknownIdentifier);

          // each LanguageResult struct is freed via `volatileReadResultStruct` already. delete allocated memory for array itself.
          _free(languageListPtr);

          return languageCount > 0 ? ret : [];
        },
        dispose: () => cldInterface.destroy(cldPtr)
      };
    }
  };
};